From 31def12e2629f187e40f93f6bae9e26a6c2660e2 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 30 Jun 2016 15:19:20 -0500 Subject: [PATCH 01/27] First phase of control tree redesign. Details: - These changes constitute the first set of changes in preparation for revamping the structure and use of control trees in BLIS. Modifications in this commit don't affect the control tree code yet, but rather lay the groundwork. - Defined wrappers for the following functions, where the wrappers each take a direction parameter of a new enumerated type (BLIS_BWD or BLIS_FWD), dir_t, and execute the correct underlying function. - bli_acquire_mpart_*() and _vpart_*() - bli_*_determine_kc_[fb]() - bli_thread_get_range_*() and bli_thread_get_range_weighted_*() - Consolidated all 'f' (forwards-moving) and 'b' (backwards-moving) blocked variants for trmm and trsm, and renamed gemm and herk variants accordingly. The direction is now queried via routines such as bli_trmm_direct(), which determines the direction from the implied side and uplo parameters. For gemm and herk, it is unconditionally BLIS_FWD. - Defined wrappers to parameter-specific macrokernels for herk, trmm, and trsm, e.g. bli_trmm_xx_ker_var2(), that execute the correct underlying macrokernel based on the implied parameters. The same logic used to choose the dir_t in _direct() functions is used here. - Simplified the function pointer arrays in _int() functions given the consolidation and dir_t querying mentioned above. - Function signature (whitespace) reformatting for various functions. - Removed old code in various 'old' directories. 
--- frame/3/bli_l3.h | 4 + frame/3/bli_l3_blocksize.c | 61 +++-- frame/3/bli_l3_blocksize.h | 31 ++- .../old/bli_trsm_cntx.c => bli_l3_direct.c} | 108 ++++++--- .../bli_gemm_blk_var4f.h => bli_l3_direct.h} | 21 +- .../bli_herk_blk_var2f.h => bli_l3_var_oft.h} | 48 +++- ...i_gemm_blk_var1f.c => bli_gemm_blk_var1.c} | 34 +-- ...i_gemm_blk_var2f.c => bli_gemm_blk_var2.c} | 33 +-- ...i_gemm_blk_var3f.c => bli_gemm_blk_var3.c} | 32 ++- frame/3/gemm/bli_gemm_front.c | 17 +- frame/3/gemm/bli_gemm_front.h | 18 +- frame/3/gemm/bli_gemm_int.c | 70 +++--- frame/3/gemm/bli_gemm_int.h | 19 +- frame/3/gemm/bli_gemm_ker_var2.c | 15 +- frame/3/gemm/bli_gemm_var.h | 12 +- ...i_gemm_blk_var4f.c => bli_gemm_blk_var4.c} | 15 +- .../ind/bli_gemm_blk_var4.h} | 14 +- frame/3/gemm/ind/bli_gemm_ker_var3.c | 15 +- frame/3/gemm/ind/bli_gemm_ker_var3.h | 15 +- frame/3/gemm/ind/bli_gemm_ker_var4.c | 15 +- frame/3/gemm/ind/bli_gemm_ker_var4.h | 15 +- frame/3/hemm/bli_hemm_front.c | 19 +- frame/3/hemm/bli_hemm_front.h | 20 +- frame/3/her2k/bli_her2k_front.c | 17 +- frame/3/her2k/bli_her2k_front.h | 18 +- ...i_herk_blk_var1f.c => bli_herk_blk_var1.c} | 34 +-- ...i_herk_blk_var2f.c => bli_herk_blk_var2.c} | 34 +-- ...i_herk_blk_var3f.c => bli_herk_blk_var3.c} | 32 ++- frame/3/herk/bli_herk_front.c | 15 +- frame/3/herk/bli_herk_front.h | 16 +- frame/3/herk/bli_herk_int.c | 81 ++++--- frame/3/herk/bli_herk_int.h | 20 +- frame/3/herk/bli_herk_l_ker_var2.c | 15 +- frame/3/herk/bli_herk_u_ker_var2.c | 15 +- frame/3/herk/bli_herk_var.h | 7 +- ...herk_blk_var3f.h => bli_herk_x_ker_var2.c} | 44 +++- frame/3/herk/old/bli_herk_blk_var1f.h | 41 ---- frame/3/herk/old/bli_herk_l_ker_var2.h | 73 ------ frame/3/herk/old/bli_herk_thread.c | 150 ------------- frame/3/herk/old/bli_herk_u_ker_var2.h | 73 ------ .../bli_herk_direct.c} | 16 +- frame/3/old/bli_herk_direct.h | 40 ++++ .../bli_trmm_direct.c} | 29 ++- .../bli_trsm_cntx.h => old/bli_trmm_direct.h} | 8 +- .../bli_trsm_direct.c} | 27 ++- 
.../bli_trsm_direct.h} | 11 +- frame/3/symm/bli_symm_front.c | 19 +- frame/3/symm/bli_symm_front.h | 20 +- frame/3/syr2k/bli_syr2k_front.c | 17 +- frame/3/syr2k/bli_syr2k_front.h | 18 +- frame/3/syrk/bli_syrk_front.c | 15 +- frame/3/syrk/bli_syrk_front.h | 16 +- frame/3/trmm/bli_trmm_blk_var1.c | 157 +++++++++++++ frame/3/trmm/bli_trmm_blk_var2.c | 156 +++++++++++++ frame/3/trmm/bli_trmm_blk_var3.c | 160 ++++++++++++++ frame/3/trmm/bli_trmm_front.c | 15 +- frame/3/trmm/bli_trmm_front.h | 16 +- frame/3/trmm/bli_trmm_int.c | 102 ++++----- frame/3/trmm/bli_trmm_int.h | 20 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 15 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 15 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 15 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 15 +- frame/3/trmm/bli_trmm_var.h | 10 +- ...i_trmm_thread.h => bli_trmm_xx_ker_var2.c} | 58 ++++- frame/3/trmm/{ => old}/bli_trmm_blk_var1f.c | 0 frame/3/trmm/old/bli_trmm_blk_var1f.h | 41 ---- frame/3/trmm/{ => old}/bli_trmm_blk_var2b.c | 0 frame/3/trmm/old/bli_trmm_blk_var2b.h | 41 ---- frame/3/trmm/{ => old}/bli_trmm_blk_var2f.c | 0 frame/3/trmm/old/bli_trmm_blk_var2f.h | 41 ---- frame/3/trmm/{ => old}/bli_trmm_blk_var3b.c | 0 frame/3/trmm/old/bli_trmm_blk_var3b.h | 41 ---- frame/3/trmm/{ => old}/bli_trmm_blk_var3f.c | 0 frame/3/trmm/old/bli_trmm_blk_var3f.h | 41 ---- frame/3/trmm/old/bli_trmm_ll_ker_var2.h | 71 ------ frame/3/trmm/old/bli_trmm_lu_ker_var2.h | 71 ------ frame/3/trmm/old/bli_trmm_rl_ker_var2.h | 71 ------ frame/3/trmm/old/bli_trmm_ru_ker_var2.h | 71 ------ frame/3/trmm/old/bli_trmm_thread.c | 156 ------------- frame/3/trmm3/bli_trmm3_front.c | 19 +- frame/3/trmm3/bli_trmm3_front.h | 19 +- ...i_trsm_blk_var1f.c => bli_trsm_blk_var1.c} | 34 +-- ...i_trsm_blk_var2f.c => bli_trsm_blk_var2.c} | 34 +-- ...i_trsm_blk_var3f.c => bli_trsm_blk_var3.c} | 32 ++- frame/3/trsm/bli_trsm_front.c | 17 +- frame/3/trsm/bli_trsm_front.h | 18 +- frame/3/trsm/bli_trsm_int.c | 103 ++++----- frame/3/trsm/bli_trsm_int.h | 20 +- 
frame/3/trsm/bli_trsm_ll_ker_var2.c | 15 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 15 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 15 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 15 +- frame/3/trsm/bli_trsm_var.h | 10 +- frame/3/trsm/bli_trsm_xx_ker_var2.c | 87 ++++++++ frame/3/trsm/{ => old}/bli_trsm_blk_var1b.c | 0 frame/3/trsm/old/bli_trsm_blk_var1b.h | 40 ---- frame/3/trsm/{ => old}/bli_trsm_blk_var2b.c | 0 frame/3/trsm/{ => old}/bli_trsm_blk_var3b.c | 0 frame/3/trsm/old/bli_trsm_blk_var3b.h | 40 ---- frame/3/trsm/old/bli_trsm_blk_var3f.h | 40 ---- frame/3/trsm/old/bli_trsm_ll_ker_var2.h | 71 ------ frame/3/trsm/old/bli_trsm_lu_ker_var2.h | 71 ------ frame/3/trsm/old/bli_trsm_rl_ker_var2.h | 71 ------ frame/3/trsm/old/bli_trsm_ru_ker_var2.h | 71 ------ frame/3/trsm/old/bli_trsm_thread.c | 169 -------------- frame/base/bli_blksz.c | 136 ++++++++---- frame/base/bli_blksz.h | 96 +++++--- frame/base/bli_part.c | 208 +++++++++++------- frame/base/bli_part.h | 75 +++---- frame/include/bli_type_defs.h | 9 + frame/thread/bli_thread.c | 171 +++++++++++--- frame/thread/bli_thread.h | 22 +- 113 files changed, 2197 insertions(+), 2517 deletions(-) rename frame/3/{trsm/old/bli_trsm_cntx.c => bli_l3_direct.c} (52%) rename frame/3/{gemm/ind/bli_gemm_blk_var4f.h => bli_l3_direct.h} (87%) rename frame/3/{herk/old/bli_herk_blk_var2f.h => bli_l3_var_oft.h} (74%) rename frame/3/gemm/{bli_gemm_blk_var1f.c => bli_gemm_blk_var1.c} (88%) rename frame/3/gemm/{bli_gemm_blk_var2f.c => bli_gemm_blk_var2.c} (88%) rename frame/3/gemm/{bli_gemm_blk_var3f.c => bli_gemm_blk_var3.c} (89%) rename frame/3/gemm/ind/{bli_gemm_blk_var4f.c => bli_gemm_blk_var4.c} (97%) rename frame/3/{trsm/old/bli_trsm_blk_var1f.h => gemm/ind/bli_gemm_blk_var4.h} (89%) rename frame/3/herk/{bli_herk_blk_var1f.c => bli_herk_blk_var1.c} (88%) rename frame/3/herk/{bli_herk_blk_var2f.c => bli_herk_blk_var2.c} (88%) rename frame/3/herk/{bli_herk_blk_var3f.c => bli_herk_blk_var3.c} (89%) rename frame/3/herk/{old/bli_herk_blk_var3f.h 
=> bli_herk_x_ker_var2.c} (73%) delete mode 100644 frame/3/herk/old/bli_herk_blk_var1f.h delete mode 100644 frame/3/herk/old/bli_herk_l_ker_var2.h delete mode 100644 frame/3/herk/old/bli_herk_thread.c delete mode 100644 frame/3/herk/old/bli_herk_u_ker_var2.h rename frame/3/{trsm/old/bli_trsm_blk_var2b.h => old/bli_herk_direct.c} (89%) create mode 100644 frame/3/old/bli_herk_direct.h rename frame/3/{herk/old/bli_herk_thread.h => old/bli_trmm_direct.c} (78%) rename frame/3/{trsm/old/bli_trsm_cntx.h => old/bli_trmm_direct.h} (95%) rename frame/3/{trsm/old/bli_trsm_thread.h => old/bli_trsm_direct.c} (78%) rename frame/3/{trsm/old/bli_trsm_blk_var2f.h => old/bli_trsm_direct.h} (89%) create mode 100644 frame/3/trmm/bli_trmm_blk_var1.c create mode 100644 frame/3/trmm/bli_trmm_blk_var2.c create mode 100644 frame/3/trmm/bli_trmm_blk_var3.c rename frame/3/trmm/{old/bli_trmm_thread.h => bli_trmm_xx_ker_var2.c} (61%) rename frame/3/trmm/{ => old}/bli_trmm_blk_var1f.c (100%) delete mode 100644 frame/3/trmm/old/bli_trmm_blk_var1f.h rename frame/3/trmm/{ => old}/bli_trmm_blk_var2b.c (100%) delete mode 100644 frame/3/trmm/old/bli_trmm_blk_var2b.h rename frame/3/trmm/{ => old}/bli_trmm_blk_var2f.c (100%) delete mode 100644 frame/3/trmm/old/bli_trmm_blk_var2f.h rename frame/3/trmm/{ => old}/bli_trmm_blk_var3b.c (100%) delete mode 100644 frame/3/trmm/old/bli_trmm_blk_var3b.h rename frame/3/trmm/{ => old}/bli_trmm_blk_var3f.c (100%) delete mode 100644 frame/3/trmm/old/bli_trmm_blk_var3f.h delete mode 100644 frame/3/trmm/old/bli_trmm_ll_ker_var2.h delete mode 100644 frame/3/trmm/old/bli_trmm_lu_ker_var2.h delete mode 100644 frame/3/trmm/old/bli_trmm_rl_ker_var2.h delete mode 100644 frame/3/trmm/old/bli_trmm_ru_ker_var2.h delete mode 100644 frame/3/trmm/old/bli_trmm_thread.c rename frame/3/trsm/{bli_trsm_blk_var1f.c => bli_trsm_blk_var1.c} (86%) rename frame/3/trsm/{bli_trsm_blk_var2f.c => bli_trsm_blk_var2.c} (88%) rename frame/3/trsm/{bli_trsm_blk_var3f.c => bli_trsm_blk_var3.c} (89%) 
create mode 100644 frame/3/trsm/bli_trsm_xx_ker_var2.c rename frame/3/trsm/{ => old}/bli_trsm_blk_var1b.c (100%) delete mode 100644 frame/3/trsm/old/bli_trsm_blk_var1b.h rename frame/3/trsm/{ => old}/bli_trsm_blk_var2b.c (100%) rename frame/3/trsm/{ => old}/bli_trsm_blk_var3b.c (100%) delete mode 100644 frame/3/trsm/old/bli_trsm_blk_var3b.h delete mode 100644 frame/3/trsm/old/bli_trsm_blk_var3f.h delete mode 100644 frame/3/trsm/old/bli_trsm_ll_ker_var2.h delete mode 100644 frame/3/trsm/old/bli_trsm_lu_ker_var2.h delete mode 100644 frame/3/trsm/old/bli_trsm_rl_ker_var2.h delete mode 100644 frame/3/trsm/old/bli_trsm_ru_ker_var2.h delete mode 100644 frame/3/trsm/old/bli_trsm_thread.c diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 9f17349af..13111fd60 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -39,6 +39,7 @@ #include "bli_l3_oft.h" #include "bli_l3_blocksize.h" +#include "bli_l3_direct.h" #include "bli_l3_prune.h" // Prototype object APIs with and without contexts. @@ -67,3 +68,6 @@ #include "bli_trmm3.h" #include "bli_trsm.h" +// Variant object function pointer types. 
+#include "bli_l3_var_oft.h" + diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 97556dedd..950f13974 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -34,18 +34,43 @@ #include "blis.h" +#undef GENFRONT +#define GENFRONT( opname, l3op ) \ +\ +dim_t PASTEMAC0(opname) \ + ( \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ + ) \ +{ \ + if ( direct == BLIS_FWD ) \ + return PASTEMAC(l3op,_determine_kc_f)( i, dim, a, b, bszid, cntx ); \ + else \ + return PASTEMAC(l3op,_determine_kc_b)( i, dim, a, b, bszid, cntx ); \ +} + +GENFRONT( gemm_determine_kc, gemm ) +GENFRONT( trmm_determine_kc, trmm ) +GENFRONT( trsm_determine_kc, trsm ) + +// ----------------------------------------------------------------------------- #undef GENFRONT #define GENFRONT( opname, chdir ) \ \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ) \ { \ num_t dt; \ @@ -105,12 +130,12 @@ GENFRONT( gemm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ) \ { \ num_t dt; \ @@ -164,12 +189,12 @@ GENFRONT( trmm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ) \ { \ num_t dt; \ diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 01e10c3fe..5898186b1 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -38,12 +38,31 @@ \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, 
\ - cntx_t* cntx \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ + ); + +GENPROT( gemm_determine_kc ) +GENPROT( trmm_determine_kc ) +GENPROT( trsm_determine_kc ) + + +#undef GENPROT +#define GENPROT( opname ) \ +\ +dim_t PASTEMAC0(opname) \ + ( \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) diff --git a/frame/3/trsm/old/bli_trsm_cntx.c b/frame/3/bli_l3_direct.c similarity index 52% rename from frame/3/trsm/old/bli_trsm_cntx.c rename to frame/3/bli_l3_direct.c index 186c146df..3d30cea9e 100644 --- a/frame/3/trsm/old/bli_trsm_cntx.c +++ b/frame/3/bli_l3_direct.c @@ -34,43 +34,85 @@ #include "blis.h" -void bli_trsm_cntx_init( cntx_t* cntx ) +dir_t bli_gemm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) { - // Perform basic setup on the context. - bli_cntx_obj_create( cntx ); + // For gemm, movement may be forwards (or backwards). - // Initialize the context with the current architecture's native - // level-3 gemm micro-kernel, and its output preferences. - bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMM_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); - - // Initialize the context with the current architecture's native - // level-3 trsm micro-kernels. - bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMMTRSM_L_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMMTRSM_U_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_L_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_U_UKR, cntx ); - - // Initialize the context with the current architecture's register - // and cache blocksizes (and multiples), given the execution method. - bli_gks_cntx_set_blkszs( BLIS_NAT, 6, - BLIS_NC, BLIS_NR, - BLIS_KC, BLIS_KR, - BLIS_MC, BLIS_MR, - BLIS_NR, BLIS_NR, - BLIS_MR, BLIS_MR, - BLIS_KR, BLIS_KR, - cntx ); - - // Set the pack_t schemas for native execution. 
- bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); + return BLIS_FWD; } -void bli_trsm_cntx_finalize( cntx_t* cntx ) +dir_t bli_herk_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) { - // Free the context and all memory allocated to it. - bli_cntx_obj_free( cntx ); + // For herk, movement may be forwards (or backwards). + + return BLIS_FWD; +} + +dir_t bli_trmm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; + + // For trmm, movement for the parameter cases is as follows: + // - left,lower: backwards + // - left,upper: forwards + // - right,lower: forwards + // - right,upper: backwards + + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_BWD; + else direct = BLIS_FWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + + return direct; +} + +dir_t bli_trsm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; + + // For trsm, movement for the parameter cases is as follows: + // - left,lower: forwards + // - left,upper: backwards + // - right,lower: backwards + // - right,upper: forwards + + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_BWD; + else direct = BLIS_FWD; + } + + return direct; } diff --git a/frame/3/gemm/ind/bli_gemm_blk_var4f.h b/frame/3/bli_l3_direct.h similarity index 87% rename from frame/3/gemm/ind/bli_gemm_blk_var4f.h rename to frame/3/bli_l3_direct.h index 289e76550..28c60c428 100644 --- a/frame/3/gemm/ind/bli_gemm_blk_var4f.h +++ b/frame/3/bli_l3_direct.h @@ -32,10 +32,19 @@ */ -void bli_gemm_blk_var4f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); + +#undef GENPROT +#define GENPROT( opname ) \ +\ +dir_t 
PASTEMAC0(opname) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ); + +GENPROT( gemm_direct ) +GENPROT( herk_direct ) +GENPROT( trmm_direct ) +GENPROT( trsm_direct ) diff --git a/frame/3/herk/old/bli_herk_blk_var2f.h b/frame/3/bli_l3_var_oft.h similarity index 74% rename from frame/3/herk/old/bli_herk_blk_var2f.h rename to frame/3/bli_l3_var_oft.h index f436a0082..ef48d5e85 100644 --- a/frame/3/herk/old/bli_herk_blk_var2f.h +++ b/frame/3/bli_l3_var_oft.h @@ -32,10 +32,46 @@ */ -void bli_herk_blk_var2f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); +#ifndef BLIS_L3_VAR_OFT_H +#define BLIS_L3_VAR_OFT_H + + +// +// -- Level-3 variant function types ------------------------------------------- +// + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + gemm_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( gemm ) + + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + trsm_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( trsm ) + + + +#endif diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1.c similarity index 88% rename from frame/3/gemm/bli_gemm_blk_var1f.c rename to frame/3/gemm/bli_gemm_blk_var1.c index ee4a6a763..4a0d00c11 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_gemm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { //The s is for "lives on the stack" obj_t b_pack_s; @@ -50,9 +53,14 @@ void bli_gemm_blk_var1f( obj_t* a, obj_t* b_pack = NULL; obj_t* c1_pack = NULL; + dir_t direct; + dim_t i; dim_t b_alg; + // Determine the 
direction in which to partition (forwards or backwards). + direct = bli_gemm_direct( a, b, c ); + if( bli_thread_am_ochief( thread ) ) { // Initialize object for packing B. bli_obj_init_pack( &b_pack_s ); @@ -81,7 +89,7 @@ void bli_gemm_blk_var1f( obj_t* a, bli_thrinfo_sub_opackm( thread ) ); dim_t my_start, my_end; - bli_thread_get_range_t2b( thread, a, + bli_thread_get_range_mdim( direct, thread, a, bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), &my_start, &my_end ); @@ -92,14 +100,14 @@ void bli_gemm_blk_var1f( obj_t* a, // NOTE: Use of a (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. 
if( bli_thread_am_ichief( thread ) ) { diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2.c similarity index 88% rename from frame/3/gemm/bli_gemm_blk_var2f.c rename to frame/3/gemm/bli_gemm_blk_var2.c index f44951a20..b27d70a2f 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_gemm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { obj_t a_pack_s; obj_t b1_pack_s, c1_pack_s; @@ -49,9 +52,13 @@ void bli_gemm_blk_var2f( obj_t* a, obj_t* b1_pack = NULL; obj_t* c1_pack = NULL; + dir_t direct; + dim_t i; dim_t b_alg; + // Determine the direction in which to partition (forwards or backwards). + direct = bli_gemm_direct( a, b, c ); if( bli_thread_am_ochief( thread ) ) { // Initialize object for packing A @@ -80,7 +87,7 @@ void bli_gemm_blk_var2f( obj_t* a, bli_thrinfo_sub_opackm( thread ) ); dim_t my_start, my_end; - bli_thread_get_range_l2r( thread, b, + bli_thread_get_range_ndim( direct, thread, b, bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), &my_start, &my_end ); @@ -91,14 +98,14 @@ void bli_gemm_blk_var2f( obj_t* a, // NOTE: Use of b (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. - b_alg = bli_determine_blocksize_f( i, my_end, b, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, c, &c1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. 
if( bli_thread_am_ichief( thread ) ) { diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3.c similarity index 89% rename from frame/3/gemm/bli_gemm_blk_var3f.c rename to frame/3/gemm/bli_gemm_blk_var3.c index 073760900..ad5a92ffc 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_gemm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { obj_t c_pack_s; obj_t a1_pack_s, b1_pack_s; @@ -49,10 +52,15 @@ void bli_gemm_blk_var3f( obj_t* a, obj_t* b1_pack = NULL; obj_t* c_pack = NULL; + dir_t direct; + dim_t i; dim_t b_alg; dim_t k_trans; + // Determine the direction in which to partition (forwards or backwards). + direct = bli_gemm_direct( a, b, c ); + if( bli_thread_am_ochief( thread ) ){ // Initialize object for packing C bli_obj_init_pack( &c_pack_s ); @@ -89,14 +97,14 @@ void bli_gemm_blk_var3f( obj_t* a, // NOTE: We call a gemm/hemm/symm-specific function to determine // the kc blocksize so that we can implement the "nudging" of kc // to be a multiple of mr or nr, as needed. - b_alg = bli_gemm_determine_kc_f( i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_gemm_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and B1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. 
if( bli_thread_am_ichief( thread ) ) { diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 47b5573c4..a2c7be14f 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_gemm_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_gemm_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ) { obj_t a_local; obj_t b_local; diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index 0176eef37..fc554196b 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -32,11 +32,13 @@ */ -void bli_gemm_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_gemm_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ); diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 171f2d6f1..88324705f 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -34,41 +34,35 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[6][3] = +#if 1 +static gemm_voft vars[4][3] = { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_gemm_blk_var1f }, - { NULL, bli_gemm_ker_var2, bli_gemm_blk_var2f }, - { NULL, NULL, bli_gemm_blk_var3f }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL }, - { NULL, NULL, NULL } + // unblocked optimized unblocked blocked + { NULL, NULL, bli_gemm_blk_var1 }, + { NULL, bli_gemm_ker_var2, bli_gemm_blk_var2 }, + { NULL, NULL, bli_gemm_blk_var3 }, + { NULL, NULL, NULL }, }; +#endif -void bli_gemm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* 
thread ) +void bli_gemm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t b_local; obj_t c_local; varnum_t n; impl_t i; - FUNCPTR_T f; + gemm_voft f; ind_t im; // Check parameters. @@ -140,22 +134,28 @@ void bli_gemm_int( obj_t* alpha, // Index into the variant array to extract the correct function pointer. f = vars[n][i]; + // Extract the function pointer from the current control tree node. + //f = bli_cntl_sub_prob( cntl ); + // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations. im = bli_cntx_get_ind_method( cntx ); if ( im != BLIS_NAT ) { - if ( im == BLIS_3M3 && f == bli_gemm_blk_var1f ) f = bli_gemm_blk_var4f; - else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var4; - else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var3; + if ( im == BLIS_3M3 && f == bli_gemm_blk_var1 ) f = bli_gemm_blk_var4; + else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var4; + else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var3; } // Invoke the variant. 
- f( &a_local, - &b_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &b_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/gemm/bli_gemm_int.h index 9177122fd..73e44fecf 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/gemm/bli_gemm_int.h @@ -32,12 +32,15 @@ */ -void bli_gemm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); +void bli_gemm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 036876cb6..d3f7aee5c 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -56,12 +56,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); -void bli_gemm_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 0f7ecdb11..11c9dd09d 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -50,16 +50,16 @@ void PASTEMAC0(opname) \ thrinfo_t* thread \ ); -GENPROT( gemm_blk_var1f ) -GENPROT( gemm_blk_var2f ) -GENPROT( gemm_blk_var3f ) +GENPROT( gemm_blk_var1 ) +GENPROT( gemm_blk_var2 ) +GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: -GENPROT( gemm_blk_var4f ) // 3m3 -GENPROT( gemm_ker_var3 ) // 4m1b -GENPROT( gemm_ker_var4 ) // 3m2 +GENPROT( gemm_blk_var4 ) // 3m3 +GENPROT( gemm_ker_var3 ) // 4m1b +GENPROT( gemm_ker_var4 ) // 3m2 // diff --git a/frame/3/gemm/ind/bli_gemm_blk_var4f.c b/frame/3/gemm/ind/bli_gemm_blk_var4.c similarity index 97% rename from 
frame/3/gemm/ind/bli_gemm_blk_var4f.c rename to frame/3/gemm/ind/bli_gemm_blk_var4.c index 9308014d0..10a6afa91 100644 --- a/frame/3/gemm/ind/bli_gemm_blk_var4f.c +++ b/frame/3/gemm/ind/bli_gemm_blk_var4.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_gemm_blk_var4f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_blk_var4 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { //The s is for "lives on the stack" obj_t b_pack_s; diff --git a/frame/3/trsm/old/bli_trsm_blk_var1f.h b/frame/3/gemm/ind/bli_gemm_blk_var4.h similarity index 89% rename from frame/3/trsm/old/bli_trsm_blk_var1f.h rename to frame/3/gemm/ind/bli_gemm_blk_var4.h index df5a9d3fd..d43f56983 100644 --- a/frame/3/trsm/old/bli_trsm_blk_var1f.h +++ b/frame/3/gemm/ind/bli_gemm_blk_var4.h @@ -32,9 +32,13 @@ */ -void bli_trsm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); +void bli_gemm_blk_var4 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/gemm/ind/bli_gemm_ker_var3.c b/frame/3/gemm/ind/bli_gemm_ker_var3.c index 11c684810..f368a02ab 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var3.c +++ b/frame/3/gemm/ind/bli_gemm_ker_var3.c @@ -56,12 +56,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var3); -void bli_gemm_ker_var3( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_ker_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/gemm/ind/bli_gemm_ker_var3.h b/frame/3/gemm/ind/bli_gemm_ker_var3.h index 042120185..06f71bc83 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var3.h +++ b/frame/3/gemm/ind/bli_gemm_ker_var3.h @@ -36,12 +36,15 @@ // // Prototype object-based interface. 
// -void bli_gemm_ker_var3( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); +void bli_gemm_ker_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ); // diff --git a/frame/3/gemm/ind/bli_gemm_ker_var4.c b/frame/3/gemm/ind/bli_gemm_ker_var4.c index 3d5cd1859..3ef423c26 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var4.c +++ b/frame/3/gemm/ind/bli_gemm_ker_var4.c @@ -56,12 +56,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var4); -void bli_gemm_ker_var4( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_ker_var4 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/gemm/ind/bli_gemm_ker_var4.h b/frame/3/gemm/ind/bli_gemm_ker_var4.h index 95268de2a..ad72fdd67 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var4.h +++ b/frame/3/gemm/ind/bli_gemm_ker_var4.h @@ -36,12 +36,15 @@ // // Prototype object-based interface. 
// -void bli_gemm_ker_var4( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); +void bli_gemm_ker_var4 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ); // diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 21bda90da..4b9f082f6 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -34,14 +34,17 @@ #include "blis.h" -void bli_hemm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_hemm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ) { obj_t a_local; obj_t b_local; diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h index 840b24791..c369d7be2 100644 --- a/frame/3/hemm/bli_hemm_front.h +++ b/frame/3/hemm/bli_hemm_front.h @@ -32,12 +32,14 @@ */ -void bli_hemm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_hemm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ); diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 263155de2..61d54ca79 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_her2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_her2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ) { obj_t alpha_conj; obj_t c_local; diff --git a/frame/3/her2k/bli_her2k_front.h b/frame/3/her2k/bli_her2k_front.h index 8a699c4c4..8a2301eb0 100644 --- a/frame/3/her2k/bli_her2k_front.h +++ b/frame/3/her2k/bli_her2k_front.h @@ -32,11 +32,13 @@ */ -void bli_her2k_front( 
obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_her2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ); diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1.c similarity index 88% rename from frame/3/herk/bli_herk_blk_var1f.c rename to frame/3/herk/bli_herk_blk_var1.c index 95bc56f9c..535e4f845 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_herk_blk_var1f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_blk_var1 + ( + obj_t* a, + obj_t* ah, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { obj_t ah_pack_s; obj_t a1_pack_s, c1_pack_s; @@ -49,9 +52,14 @@ void bli_herk_blk_var1f( obj_t* a, obj_t* c1_pack; obj_t* ah_pack; + dir_t direct; + dim_t i; dim_t b_alg; + // Determine the direction in which to partition (forwards or backwards). + direct = bli_herk_direct( a, ah, c ); + // Prune any zero region that exists along the partitioning dimension. bli_herk_prune_unref_mparts_m( a, ah, c ); @@ -83,7 +91,7 @@ void bli_herk_blk_var1f( obj_t* a, bli_thrinfo_sub_opackm( thread ) ); dim_t my_start, my_end; - bli_thread_get_range_weighted_t2b( thread, c, + bli_thread_get_range_weighted_mdim( direct, thread, c, bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), &my_start, &my_end ); @@ -91,14 +99,14 @@ void bli_herk_blk_var1f( obj_t* a, for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. 
- bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. if( bli_thread_am_ichief( thread ) ) { diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2.c similarity index 88% rename from frame/3/herk/bli_herk_blk_var2f.c rename to frame/3/herk/bli_herk_blk_var2.c index de7f6c972..661d875d3 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_herk_blk_var2f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_blk_var2 + ( + obj_t* a, + obj_t* ah, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { obj_t a_pack_s; obj_t ah1_pack_s, c1_pack_s; @@ -49,9 +52,14 @@ void bli_herk_blk_var2f( obj_t* a, obj_t* ah1_pack; obj_t* c1_pack; + dir_t direct; + dim_t i; dim_t b_alg; + // Determine the direction in which to partition (forwards or backwards). + direct = bli_herk_direct( a, ah, c ); + // Prune any zero region that exists along the partitioning dimension. bli_herk_prune_unref_mparts_n( a, ah, c ); @@ -82,7 +90,7 @@ void bli_herk_blk_var2f( obj_t* a, bli_thrinfo_sub_opackm( thread ) ); dim_t my_start, my_end; - bli_thread_get_range_weighted_l2r( thread, c, + bli_thread_get_range_weighted_ndim( direct, thread, c, bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), &my_start, &my_end ); @@ -90,14 +98,14 @@ void bli_herk_blk_var2f( obj_t* a, for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1' and C1. 
- bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, ah, &ah1 ); - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, c, &c1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, ah, &ah1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); // Initialize objects for packing A1' and C1. if( bli_thread_am_ichief( thread ) ) { diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3.c similarity index 89% rename from frame/3/herk/bli_herk_blk_var3f.c rename to frame/3/herk/bli_herk_blk_var3.c index 7e82ba87f..547c4a37f 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_herk_blk_var3f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_blk_var3 + ( + obj_t* a, + obj_t* ah, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { obj_t c_pack_s; obj_t a1_pack_s, ah1_pack_s; @@ -49,10 +52,15 @@ void bli_herk_blk_var3f( obj_t* a, obj_t* ah1_pack = NULL; obj_t* c_pack = NULL; + dir_t direct; + dim_t i; dim_t b_alg; dim_t k_trans; + // Determine the direction in which to partition (forwards or backwards). + direct = bli_herk_direct( a, ah, c ); + // Prune any zero region that exists along the partitioning dimension. bli_herk_prune_unref_mparts_k( a, ah, c ); @@ -89,14 +97,14 @@ void bli_herk_blk_var3f( obj_t* a, for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, k_trans, a, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_determine_blocksize( direct, i, k_trans, a, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and A1'. 
- bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, ah, &ah1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, ah, &ah1 ); // Initialize objects for packing A1 and A1'. if( bli_thread_am_ichief( thread ) ) { diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index a4bd5ef0b..201ac45ae 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_herk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_herk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ) { obj_t a_local; obj_t ah_local; diff --git a/frame/3/herk/bli_herk_front.h b/frame/3/herk/bli_herk_front.h index c778399d0..572536493 100644 --- a/frame/3/herk/bli_herk_front.h +++ b/frame/3/herk/bli_herk_front.h @@ -32,10 +32,12 @@ */ -void bli_herk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_herk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ); diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/bli_herk_int.c index 643a46ba4..409b693a5 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/bli_herk_int.c @@ -34,51 +34,38 @@ #include "blis.h" -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[2][4][3] = +#if 1 +static gemm_voft vars[4][3] = { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_herk_blk_var1f }, - { NULL, bli_herk_l_ker_var2, bli_herk_blk_var2f }, - { NULL, NULL, bli_herk_blk_var3f }, - { NULL, NULL, NULL }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, 
bli_herk_blk_var1f }, - { NULL, bli_herk_u_ker_var2, bli_herk_blk_var2f }, - { NULL, NULL, bli_herk_blk_var3f }, - { NULL, NULL, NULL }, - } + // unblocked optimized unblocked blocked + { NULL, NULL, bli_herk_blk_var1 }, + { NULL, bli_herk_x_ker_var2, bli_herk_blk_var2 }, + { NULL, NULL, bli_herk_blk_var3 }, + { NULL, NULL, NULL }, }; +#endif -void bli_herk_int( obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_int + ( + obj_t* alpha, + obj_t* a, + obj_t* ah, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t ah_local; obj_t c_local; +#if 0 + bool_t uplo; +#endif varnum_t n; impl_t i; - bool_t uplo; - FUNCPTR_T f; + gemm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -129,23 +116,31 @@ void bli_herk_int( obj_t* alpha, bli_obj_scalar_apply_scalar( beta, &c_local ); } +#if 0 // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c_local ) ) uplo = 0; else uplo = 1; +#endif // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. - f = vars[uplo][n][i]; + f = vars[n][i]; + + // Extract the function pointer from the current control tree node. + //f = bli_cntl_sub_prob( cntl ); // Invoke the variant. 
- f( &a_local, - &ah_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &ah_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/herk/bli_herk_int.h b/frame/3/herk/bli_herk_int.h index 80442d228..c762b9372 100644 --- a/frame/3/herk/bli_herk_int.h +++ b/frame/3/herk/bli_herk_int.h @@ -32,12 +32,14 @@ */ -void bli_herk_int( obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - +void bli_herk_int + ( + obj_t* alpha, + obj_t* a, + obj_t* ah, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 94d6f6a77..0951337dc 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -57,12 +57,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); -void bli_herk_l_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_l_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index cc137d989..2dfec1090 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -57,12 +57,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); -void bli_herk_u_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_u_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/herk/bli_herk_var.h index 03d9b9ff5..fd68d2fd1 100644 --- a/frame/3/herk/bli_herk_var.h +++ b/frame/3/herk/bli_herk_var.h @@ -50,10 +50,11 
@@ void PASTEMAC0(opname) \ thrinfo_t* thread \ ); -GENPROT( herk_blk_var1f ) -GENPROT( herk_blk_var2f ) -GENPROT( herk_blk_var3f ) +GENPROT( herk_blk_var1 ) +GENPROT( herk_blk_var2 ) +GENPROT( herk_blk_var3 ) +GENPROT( herk_x_ker_var2 ) GENPROT( herk_l_ker_var2 ) GENPROT( herk_u_ker_var2 ) diff --git a/frame/3/herk/old/bli_herk_blk_var3f.h b/frame/3/herk/bli_herk_x_ker_var2.c similarity index 73% rename from frame/3/herk/old/bli_herk_blk_var3f.h rename to frame/3/herk/bli_herk_x_ker_var2.c index 800a44b8d..4f29cd4d8 100644 --- a/frame/3/herk/old/bli_herk_blk_var3f.h +++ b/frame/3/herk/bli_herk_x_ker_var2.c @@ -32,10 +32,42 @@ */ -void bli_herk_blk_var3f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); +#include "blis.h" + +static gemm_voft vars[2] = +{ + bli_herk_l_ker_var2, bli_herk_u_ker_var2, +}; + +void bli_herk_x_ker_var2 + ( + obj_t* a, + obj_t* ah, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) +{ + bool_t uplo; + gemm_voft f; + + // Set a bool based on the uplo field of C's root object. + if ( bli_obj_root_is_lower( *c ) ) uplo = 0; + else uplo = 1; + + // Index into the variant array to extract the correct function pointer. + f = vars[uplo]; + + // Call the macrokernel. + f + ( + a, + ah, + c, + cntx, + cntl, + thread + ); +} diff --git a/frame/3/herk/old/bli_herk_blk_var1f.h b/frame/3/herk/old/bli_herk_blk_var1f.h deleted file mode 100644 index bd1d8a95f..000000000 --- a/frame/3/herk/old/bli_herk_blk_var1f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_herk_blk_var1f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); - diff --git a/frame/3/herk/old/bli_herk_l_ker_var2.h b/frame/3/herk/old/bli_herk_l_ker_var2.h deleted file mode 100644 index 09656596d..000000000 --- a/frame/3/herk/old/bli_herk_l_ker_var2.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_herk_l_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - herk_thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( herk_l_ker_var2 ) - diff --git a/frame/3/herk/old/bli_herk_thread.c b/frame/3/herk/old/bli_herk_thread.c deleted file mode 100644 index 6bb9d6e98..000000000 --- a/frame/3/herk/old/bli_herk_thread.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include "assert.h" - -#if 0 -thrinfo_t** bli_herk_thrinfo_create_paths( void ) -{ - -#ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" ); -// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - dim_t kc_way = 1; - dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" ); - dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" ); - dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" ); -#else - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t jr_way = 1; - dim_t ir_way = 1; -#endif - - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; - dim_t ic_nt = jr_way * ir_way; - dim_t jr_nt = ir_way; - dim_t ir_nt = 1; - - - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); - - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); - for( int a = 0; a < jc_way; a++ ) - { - thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) - { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); - for( int c = 0; c < ic_way; c++ ) - { - thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt ); - for( int d = 0; d < jr_way; d++ ) - { - thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt ); - for( int e = 0; e < ir_way; e++ ) - { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; 
- dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - - // Macrokernel loops - thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL); - - thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info); - //blk_var_1 - packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info); - //blk_var_3 - packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info); - //blk_var_2 - packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info); - - paths[global_comm_id] = jc_info; - } - } - } - } - } - return paths; -} -#endif diff --git a/frame/3/herk/old/bli_herk_u_ker_var2.h b/frame/3/herk/old/bli_herk_u_ker_var2.h deleted file mode 100644 index 
0701db148..000000000 --- a/frame/3/herk/old/bli_herk_u_ker_var2.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_herk_u_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - herk_thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( herk_u_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_blk_var2b.h b/frame/3/old/bli_herk_direct.c similarity index 89% rename from frame/3/trsm/old/bli_trsm_blk_var2b.h rename to frame/3/old/bli_herk_direct.c index d890990e7..729812e84 100644 --- a/frame/3/trsm/old/bli_trsm_blk_var2b.h +++ b/frame/3/old/bli_herk_direct.c @@ -32,9 +32,15 @@ */ -void bli_trsm_blk_var2b( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); +#include "blis.h" + +dir_t bli_herk_direct + ( + obj_t* a, + obj_t* ah, + obj_t* c + ) +{ + return BLIS_FWD; +} diff --git a/frame/3/old/bli_herk_direct.h b/frame/3/old/bli_herk_direct.h new file mode 100644 index 000000000..1f027561c --- /dev/null +++ b/frame/3/old/bli_herk_direct.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +dir_t bli_herk_direct + ( + obj_t* a, + obj_t* ah, + obj_t* c + ); diff --git a/frame/3/herk/old/bli_herk_thread.h b/frame/3/old/bli_trmm_direct.c similarity index 78% rename from frame/3/herk/old/bli_herk_thread.h rename to frame/3/old/bli_trmm_direct.c index 1feafd113..43be1b16a 100644 --- a/frame/3/herk/old/bli_herk_thread.h +++ b/frame/3/old/bli_trmm_direct.c @@ -32,13 +32,28 @@ */ -#define bli_thrinfo_sub_self( thread ) thread->sub_l3op -#define bli_thrinfo_sub_opackm( thread ) thread->opackm -#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm +#include "blis.h" -// For use in herk micro-kernel -#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) +dir_t bli_trmm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; -//thrinfo_t** bli_herk_thrinfo_create_paths( void ); + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a 
) ) direct = BLIS_BWD; + else direct = BLIS_FWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + + return direct; +} diff --git a/frame/3/trsm/old/bli_trsm_cntx.h b/frame/3/old/bli_trmm_direct.h similarity index 95% rename from frame/3/trsm/old/bli_trsm_cntx.h rename to frame/3/old/bli_trmm_direct.h index 0bdc9e7a8..905ba8fc9 100644 --- a/frame/3/trsm/old/bli_trsm_cntx.h +++ b/frame/3/old/bli_trmm_direct.h @@ -32,6 +32,10 @@ */ -void bli_trsm_cntx_init( void ); -void bli_trsm_cntx_finalize( void ); +dir_t bli_trmm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ); diff --git a/frame/3/trsm/old/bli_trsm_thread.h b/frame/3/old/bli_trsm_direct.c similarity index 78% rename from frame/3/trsm/old/bli_trsm_thread.h rename to frame/3/old/bli_trsm_direct.c index 985b6c4a6..c640705c8 100644 --- a/frame/3/trsm/old/bli_trsm_thread.h +++ b/frame/3/old/bli_trsm_direct.c @@ -32,11 +32,28 @@ */ -#define bli_thrinfo_sub_self( thread ) thread->sub_l3op -#define bli_thrinfo_sub_opackm( thread ) thread->opackm -#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm +#include "blis.h" -#define trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +dir_t bli_trsm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; -//thrinfo_t** bli_trsm_thrinfo_create_paths( bool_t right_sided ); + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_BWD; + else direct = BLIS_FWD; + } + + return direct; +} diff --git a/frame/3/trsm/old/bli_trsm_blk_var2f.h b/frame/3/old/bli_trsm_direct.h similarity index 89% rename from frame/3/trsm/old/bli_trsm_blk_var2f.h rename to frame/3/old/bli_trsm_direct.h index 8b5d2dd7e..d7e7c206b 100644 --- a/frame/3/trsm/old/bli_trsm_blk_var2f.h 
+++ b/frame/3/old/bli_trsm_direct.h @@ -32,9 +32,10 @@ */ -void bli_trsm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); +dir_t bli_trsm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 79208b699..342f04512 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -34,14 +34,17 @@ #include "blis.h" -void bli_symm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_symm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ) { obj_t a_local; obj_t b_local; diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h index 1fb9ec019..b1ee691f2 100644 --- a/frame/3/symm/bli_symm_front.h +++ b/frame/3/symm/bli_symm_front.h @@ -32,12 +32,14 @@ */ -void bli_symm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_symm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ); diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 2fa47d27a..7d73dd17d 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_syr2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_syr2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ) { obj_t c_local; obj_t a_local; diff --git a/frame/3/syr2k/bli_syr2k_front.h b/frame/3/syr2k/bli_syr2k_front.h index 674dfe5ce..502fb033b 100644 --- a/frame/3/syr2k/bli_syr2k_front.h +++ b/frame/3/syr2k/bli_syr2k_front.h @@ -32,11 +32,13 @@ */ -void bli_syr2k_front( obj_t* alpha, - 
obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_syr2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ); diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 54ca2bf8a..e3c62245f 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_syrk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_syrk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ) { obj_t a_local; obj_t at_local; diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h index c7ab2a7b7..700b8e263 100644 --- a/frame/3/syrk/bli_syrk_front.h +++ b/frame/3/syrk/bli_syrk_front.h @@ -32,10 +32,12 @@ */ -void bli_syrk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_syrk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ); diff --git a/frame/3/trmm/bli_trmm_blk_var1.c b/frame/3/trmm/bli_trmm_blk_var1.c new file mode 100644 index 000000000..3b9eae428 --- /dev/null +++ b/frame/3/trmm/bli_trmm_blk_var1.c @@ -0,0 +1,157 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b_pack_s; + obj_t a1_pack_s, c1_pack_s; + + obj_t a1, c1; + obj_t* a1_pack = NULL; + obj_t* b_pack = NULL; + obj_t* c1_pack = NULL; + + dir_t direct; + + dim_t i; + dim_t b_alg; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_m( a, b, c ); + + if( bli_thread_am_ochief( thread ) ) { + // Initialize object for packing B. + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntx, bli_cntl_sub_packm_b( cntl ) ); + + // Scale C by beta (if instructed). 
+ // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) + bli_scalm_int( &BLIS_ONE, + c, + cntx, bli_cntl_sub_scalm( cntl ) ); + } + b_pack = bli_thread_obroadcast( thread, &b_pack_s ); + + // Initialize all pack objects that are passed into packm_init(). + if( bli_thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); + c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); + + // Pack B (if instructed). + bli_packm_int( b, b_pack, + cntx, bli_cntl_sub_packm_b( cntl ), + bli_thrinfo_sub_opackm( thread ) ); + + dim_t my_start, my_end; + bli_thread_get_range_weighted_mdim( direct, thread, a, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end ); + + // Partition along the m dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and C1. + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Initialize objects for packing A1 and C1. + if( bli_thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntx, bli_cntl_sub_packm_a( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntx, bli_cntl_sub_packm_c( cntl ) ); + } + bli_thread_ibarrier( thread ); + + // Pack A1 (if instructed). + bli_packm_int( &a1, a1_pack, + cntx, bli_cntl_sub_packm_a( cntl ), + bli_thrinfo_sub_ipackm( thread ) ); + + // Pack C1 (if instructed). + bli_packm_int( &c1, c1_pack, + cntx, bli_cntl_sub_packm_c( cntl ), + bli_thrinfo_sub_ipackm( thread ) ); + + // Perform trmm subproblem. 
+ bli_trmm_int( &BLIS_ONE, + a1_pack, + b_pack, + &BLIS_ONE, + c1_pack, + cntx, + bli_cntl_sub_gemm( cntl ), + bli_thrinfo_sub_self( thread ) ); + bli_thread_ibarrier( thread ); + + // Unpack C1 (if C1 was packed). + bli_unpackm_int( c1_pack, &c1, + cntx, bli_cntl_sub_unpackm_c( cntl ), + bli_thrinfo_sub_ipackm( thread ) ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + bli_thread_obarrier( thread ); + if( bli_thread_am_ochief( thread ) ) + bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); + if( bli_thread_am_ichief( thread ) ){ + bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); + bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); + } +} + diff --git a/frame/3/trmm/bli_trmm_blk_var2.c b/frame/3/trmm/bli_trmm_blk_var2.c new file mode 100644 index 000000000..cf53b8e28 --- /dev/null +++ b/frame/3/trmm/bli_trmm_blk_var2.c @@ -0,0 +1,156 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; + + dir_t direct; + + dim_t i; + dim_t b_alg; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_n( a, b, c ); + + if( bli_thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntx, bli_cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntx, bli_cntl_sub_scalm( cntl ) ); + } + a_pack = bli_thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). 
+ if( bli_thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntx, bli_cntl_sub_packm_a( cntl ), + bli_thrinfo_sub_opackm( thread ) ); + + dim_t my_start, my_end; + bli_thread_get_range_weighted_ndim( direct, thread, b, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end ); + + // Partition along the n dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Initialize objects for packing B1 and C1. + if( bli_thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntx, bli_cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntx, bli_cntl_sub_packm_c( cntl ) ); + } + bli_thread_ibarrier( thread ); + + // Pack B1 (if instructed). + bli_packm_int( &b1, b1_pack, + cntx, bli_cntl_sub_packm_b( cntl ), + bli_thrinfo_sub_ipackm( thread ) ); + + // Pack C1 (if instructed). + bli_packm_int( &c1, c1_pack, + cntx, bli_cntl_sub_packm_c( cntl ), + bli_thrinfo_sub_ipackm( thread ) ); + + // Perform trmm subproblem. + bli_trmm_int( &BLIS_ONE, + a_pack, + b1_pack, + &BLIS_ONE, + c1_pack, + cntx, + bli_cntl_sub_gemm( cntl ), + bli_thrinfo_sub_self( thread ) ); + bli_thread_ibarrier( thread ); + + // Unpack C1 (if C1 was packed). + bli_unpackm_int( c1_pack, &c1, + cntx, bli_cntl_sub_unpackm_c( cntl ), + bli_thrinfo_sub_ipackm( thread ) ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. 
+ bli_thread_obarrier( thread ); + if( bli_thread_am_ochief( thread ) ) + bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); + if( bli_thread_am_ichief( thread ) ) { + bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); + bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); + } +} + diff --git a/frame/3/trmm/bli_trmm_blk_var3.c b/frame/3/trmm/bli_trmm_blk_var3.c new file mode 100644 index 000000000..f6a425b07 --- /dev/null +++ b/frame/3/trmm/bli_trmm_blk_var3.c @@ -0,0 +1,160 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) +{ + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_k( a, b, c ); + + if( bli_thread_am_ochief( thread ) ){ + // Initialize object for packing C + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntx, bli_cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntx, bli_cntl_sub_scalm( cntl ) ); + } + c_pack = bli_thread_obroadcast( thread, &c_pack_s ); + + // Initialize pack objects for A and B that are passed into packm_init(). + if( bli_thread_am_ichief( thread ) ){ + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntx, bli_cntl_sub_packm_c( cntl ), + bli_thrinfo_sub_opackm( thread ) ); + + // Query dimension in partitioning direction. 
+ k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + // NOTE: We call a trmm-specific function to determine the kc + // blocksize so that we can implement the "nudging" of kc to be + // a multiple of mr or nr, as needed. + b_alg = bli_trmm_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Initialize objects for packing A1 and B1. + if( bli_thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntx, bli_cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntx, bli_cntl_sub_packm_b( cntl ) ); + } + bli_thread_ibarrier( thread ); + + // Pack A1 (if instructed). + bli_packm_int( &a1, a1_pack, + cntx, bli_cntl_sub_packm_a( cntl ), + bli_thrinfo_sub_ipackm( thread ) ); + + // Pack B1 (if instructed). + bli_packm_int( &b1, b1_pack, + cntx, bli_cntl_sub_packm_b( cntl ), + bli_thrinfo_sub_ipackm( thread ) ); + + // Perform trmm subproblem. + bli_trmm_int( &BLIS_ONE, + a1_pack, + b1_pack, + &BLIS_ONE, + c_pack, + cntx, + bli_cntl_sub_gemm( cntl ), + bli_thrinfo_sub_self( thread ) ); + bli_thread_ibarrier( thread ); + } + + bli_thread_obarrier( thread ); + + // Unpack C (if C was packed). + bli_unpackm_int( c_pack, c, + cntx, bli_cntl_sub_unpackm_c( cntl ), + bli_thrinfo_sub_opackm( thread ) ); + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. 
+ if( bli_thread_am_ochief( thread ) ){ + bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); + } + if( bli_thread_am_ichief( thread ) ){ + bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); + bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); + } +} + diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 9b860405c..55f58974b 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_trmm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - gemm_t* cntl ) +void bli_trmm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + gemm_t* cntl + ) { obj_t a_local; obj_t b_local; diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h index a05284336..d47c40ce9 100644 --- a/frame/3/trmm/bli_trmm_front.h +++ b/frame/3/trmm/bli_trmm_front.h @@ -32,10 +32,12 @@ */ -void bli_trmm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_trmm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + gemm_t* cntl + ); diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/bli_trmm_int.c index d39722e95..d6f4ca4be 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/bli_trmm_int.c @@ -34,73 +34,38 @@ #include "blis.h" -#define FUNCPTR_T trmm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[2][2][4][3] = +#if 1 +static gemm_voft vars[4][3] = { - // left - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_ll_ker_var2, bli_trmm_blk_var2b }, - { NULL, NULL, bli_trmm_blk_var3b }, - { NULL, NULL, NULL }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_lu_ker_var2, bli_trmm_blk_var2f }, 
- { NULL, NULL, bli_trmm_blk_var3f }, - { NULL, NULL, NULL }, - } - }, - // right - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_rl_ker_var2, bli_trmm_blk_var2f }, - { NULL, NULL, bli_trmm_blk_var3f }, - { NULL, NULL, NULL }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_ru_ker_var2, bli_trmm_blk_var2b }, - { NULL, NULL, bli_trmm_blk_var3b }, - { NULL, NULL, NULL }, - } - } + // unblocked optimized unblocked blocked + { NULL, NULL, bli_trmm_blk_var1 }, + { NULL, bli_trmm_xx_ker_var2, bli_trmm_blk_var2 }, + { NULL, NULL, bli_trmm_blk_var3 }, + { NULL, NULL, NULL }, }; +#endif -void bli_trmm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t b_local; obj_t c_local; +#if 0 bool_t side, uplo; +#endif varnum_t n; impl_t i; - FUNCPTR_T f; + gemm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -151,6 +116,7 @@ void bli_trmm_int( obj_t* alpha, bli_obj_scalar_apply_scalar( beta, &c_local ); } +#if 0 // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular // matrix's root object (whether that is matrix A or matrix B). @@ -163,24 +129,30 @@ void bli_trmm_int( obj_t* alpha, else // if ( bli_obj_root_is_triangular( *b ) ) { side = 1; - // Set a bool based on the uplo field of A's root object. if ( bli_obj_root_is_lower( *b ) ) uplo = 0; else uplo = 1; } +#endif // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. 
- f = vars[side][uplo][n][i]; + f = vars[n][i]; + + // Extract the function pointer from the current control tree node. + //f = bli_cntl_sub_prob( cntl ); // Invoke the variant. - f( &a_local, - &b_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &b_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_int.h b/frame/3/trmm/bli_trmm_int.h index e529d02f6..29c578324 100644 --- a/frame/3/trmm/bli_trmm_int.h +++ b/frame/3/trmm/bli_trmm_int.h @@ -32,11 +32,15 @@ */ -void bli_trmm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); +void bli_trmm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ); + diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 34928b04d..61843d4c2 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); -void bli_trmm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 382d54952..356ea1a37 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); -void bli_trmm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); 
diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 72ac03a14..581cfdf8d 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); -void bli_trmm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 0bae832d3..8033c42c2 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); -void bli_trmm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index e10166401..12bfa0b9f 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -50,13 +50,11 @@ void PASTEMAC0(opname) \ thrinfo_t* thread \ ); -GENPROT( trmm_blk_var1f ) -//GENPROT( trmm_blk_var1b ) // variant doesn't exist b/c it's not needed -GENPROT( trmm_blk_var2f ) -GENPROT( trmm_blk_var2b ) -GENPROT( trmm_blk_var3f ) -GENPROT( trmm_blk_var3b ) +GENPROT( trmm_blk_var1 ) +GENPROT( trmm_blk_var2 ) +GENPROT( trmm_blk_var3 ) +GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) diff --git a/frame/3/trmm/old/bli_trmm_thread.h b/frame/3/trmm/bli_trmm_xx_ker_var2.c similarity index 61% rename from frame/3/trmm/old/bli_trmm_thread.h rename to 
frame/3/trmm/bli_trmm_xx_ker_var2.c index bedc7781f..5b0a89659 100644 --- a/frame/3/trmm/old/bli_trmm_thread.h +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -32,14 +32,56 @@ */ -#define bli_thrinfo_sub_self( thread ) thread->sub_l3op -#define bli_thrinfo_sub_opackm( thread ) thread->opackm -#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm +#include "blis.h" -#define trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +static gemm_voft vars[2][2] = +{ + { bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 }, + { bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 } +}; -//thrinfo_t** bli_trmm_thrinfo_create_paths( bool_t jc_dependency ); +void bli_trmm_xx_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl, + thrinfo_t* thread + ) +{ + bool_t side; + bool_t uplo; + gemm_voft f; + + // Set two bools: one based on the implied side parameter (the structure + // of the root object) and one based on the uplo field of the triangular + // matrix's root object (whether that is matrix A or matrix B). + if ( bli_obj_root_is_triangular( *a ) ) + { + side = 0; + if ( bli_obj_root_is_lower( *a ) ) uplo = 0; + else uplo = 1; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + side = 1; + if ( bli_obj_root_is_lower( *b ) ) uplo = 0; + else uplo = 1; + } + + // Index into the variant array to extract the correct function pointer. + f = vars[side][uplo]; + + // Call the macrokernel. 
+ f + ( + a, + b, + c, + cntx, + cntl, + thread + ); +} diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/old/bli_trmm_blk_var1f.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var1f.c rename to frame/3/trmm/old/bli_trmm_blk_var1f.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var1f.h b/frame/3/trmm/old/bli_trmm_blk_var1f.h deleted file mode 100644 index e0876af88..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var1f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/old/bli_trmm_blk_var2b.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var2b.c rename to frame/3/trmm/old/bli_trmm_blk_var2b.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var2b.h b/frame/3/trmm/old/bli_trmm_blk_var2b.h deleted file mode 100644 index 35f41a9af..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var2b.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var2b( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/old/bli_trmm_blk_var2f.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var2f.c rename to frame/3/trmm/old/bli_trmm_blk_var2f.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var2f.h b/frame/3/trmm/old/bli_trmm_blk_var2f.h deleted file mode 100644 index 7ed265e42..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var2f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/old/bli_trmm_blk_var3b.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var3b.c rename to frame/3/trmm/old/bli_trmm_blk_var3b.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var3b.h b/frame/3/trmm/old/bli_trmm_blk_var3b.h deleted file mode 100644 index 4e9113c6a..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var3b.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var3b( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/old/bli_trmm_blk_var3f.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var3f.c rename to frame/3/trmm/old/bli_trmm_blk_var3f.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var3f.h b/frame/3/trmm/old/bli_trmm_blk_var3f.h deleted file mode 100644 index 50d8c6bbb..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var3f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/old/bli_trmm_ll_ker_var2.h b/frame/3/trmm/old/bli_trmm_ll_ker_var2.h deleted file mode 100644 index 384defe09..000000000 --- a/frame/3/trmm/old/bli_trmm_ll_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_lu_ker_var2.h b/frame/3/trmm/old/bli_trmm_lu_ker_var2.h deleted file mode 100644 index 74a17e6b4..000000000 --- a/frame/3/trmm/old/bli_trmm_lu_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_rl_ker_var2.h b/frame/3/trmm/old/bli_trmm_rl_ker_var2.h deleted file mode 100644 index 64d1128fb..000000000 --- a/frame/3/trmm/old/bli_trmm_rl_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_ru_ker_var2.h b/frame/3/trmm/old/bli_trmm_ru_ker_var2.h deleted file mode 100644 index 3df303b60..000000000 --- a/frame/3/trmm/old/bli_trmm_ru_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_thread.c b/frame/3/trmm/old/bli_trmm_thread.c deleted file mode 100644 index b17c30dd6..000000000 --- a/frame/3/trmm/old/bli_trmm_thread.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include "assert.h" - -#if 0 -thrinfo_t** bli_trmm_thrinfo_create_paths( bool_t jc_dependency ) -{ - -#ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" ); -// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - dim_t kc_way = 1; - dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" ); - dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" ); - dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" ); - - if ( jc_dependency ) - { - jr_way *= jc_way; - jc_way = 1; - } -#else - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t jr_way = 1; - dim_t ir_way = 1; -#endif - - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; - dim_t ic_nt = jr_way * ir_way; - dim_t jr_nt = ir_way; - dim_t ir_nt = 1; - - - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); - - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); - for( int a = 0; a < jc_way; a++ ) - { - thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) - { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); - for( int c = 0; c < ic_way; c++ ) - { - thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt ); - for( int d = 0; d < jr_way; d++ ) - { - thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt ); - for( int e = 0; e < ir_way; e++ ) - { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - - // Macrokernel loops - thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL); - - thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, 
ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info); - //blk_var_1 - packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info); - //blk_var_3 - packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info); - //blk_var_2 - packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info); - - paths[global_comm_id] = jc_info; - } - } - } - } - } - return paths; -} -#endif diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 19090bee5..eb816d8fc 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -34,14 +34,17 @@ #include "blis.h" -void bli_trmm3_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_trmm3_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ) { obj_t a_local; obj_t b_local; diff --git 
a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h index 052d83249..8f4feaba1 100644 --- a/frame/3/trmm3/bli_trmm3_front.h +++ b/frame/3/trmm3/bli_trmm3_front.h @@ -32,11 +32,14 @@ */ -void bli_trmm3_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); +void bli_trmm3_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + gemm_t* cntl + ); diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1.c similarity index 86% rename from frame/3/trsm/bli_trsm_blk_var1f.c rename to frame/3/trsm/bli_trsm_blk_var1.c index b3a73da6e..7d479a90a 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_trsm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ) { obj_t b_pack_s; obj_t a1_pack_s; @@ -48,9 +51,14 @@ void bli_trsm_blk_var1f( obj_t* a, obj_t* b_pack = NULL; obj_t* a1_pack = NULL; + dir_t direct; + dim_t i; dim_t b_alg; + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trsm_direct( a, b, c ); + // Prune any zero region that exists along the partitioning dimension. bli_trsm_prune_unref_mparts_m( a, b, c ); @@ -74,7 +82,7 @@ void bli_trsm_blk_var1f( obj_t* a, bli_thrinfo_sub_opackm( thread ) ); dim_t my_start, my_end; - bli_thread_get_range_t2b( thread, a, + bli_thread_get_range_mdim( direct, thread, a, ( bli_obj_root_is_triangular( *a ) ? bli_cntx_get_bmult( BLIS_MR, cntx ) : bli_cntx_get_bmult( BLIS_NR, cntx ) ), @@ -84,14 +92,14 @@ void bli_trsm_blk_var1f( obj_t* a, for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. 
- b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); // Initialize object for packing A1. if( bli_thread_am_ichief( thread ) ) { diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2.c similarity index 88% rename from frame/3/trsm/bli_trsm_blk_var2f.c rename to frame/3/trsm/bli_trsm_blk_var2.c index 42d65100e..f9bd6d135 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_trsm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ) { obj_t a_pack_s; obj_t b1_pack_s, c1_pack_s; @@ -49,9 +52,14 @@ void bli_trsm_blk_var2f( obj_t* a, obj_t* b1_pack = NULL; obj_t* c1_pack = NULL; + dir_t direct; + dim_t i; dim_t b_alg; + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trsm_direct( a, b, c ); + // Prune any zero region that exists along the partitioning dimension. bli_trsm_prune_unref_mparts_n( a, b, c ); @@ -84,7 +92,7 @@ void bli_trsm_blk_var2f( obj_t* a, bli_thrinfo_sub_opackm( thread ) ); dim_t my_start, my_end; - bli_thread_get_range_l2r( thread, b, + bli_thread_get_range_ndim( direct, thread, b, ( bli_obj_root_is_triangular( *b ) ? 
bli_cntx_get_bmult( BLIS_MR, cntx ) : bli_cntx_get_bmult( BLIS_NR, cntx ) ), @@ -94,14 +102,14 @@ void bli_trsm_blk_var2f( obj_t* a, for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, b, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, c, &c1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. if( bli_thread_am_ichief( thread ) ) { diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3.c similarity index 89% rename from frame/3/trsm/bli_trsm_blk_var3f.c rename to frame/3/trsm/bli_trsm_blk_var3.c index 52cfb1fc5..130d2281c 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_trsm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ) { obj_t c_pack_s; obj_t a1_pack_s, b1_pack_s; @@ -49,10 +52,15 @@ void bli_trsm_blk_var3f( obj_t* a, obj_t* b1_pack = NULL; obj_t* c_pack = NULL; + dir_t direct; + dim_t i; dim_t b_alg; dim_t k_trans; + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trsm_direct( a, b, c ); + // Prune any zero region that exists along the partitioning dimension. 
bli_trsm_prune_unref_mparts_k( a, b, c ); @@ -93,14 +101,14 @@ void bli_trsm_blk_var3f( obj_t* a, // NOTE: We call a trsm-specific function to determine the kc // blocksize so that we can implement the "nudging" of kc to be // a multiple of mr, as needed. - b_alg = bli_trsm_determine_kc_f( i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and B1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. if( bli_thread_am_ichief( thread ) ) { diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 53cdbb1c8..0e6e5a2c2 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_trsm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - trsm_t* l_cntl, - trsm_t* r_cntl ) +void bli_trsm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + trsm_t* l_cntl, + trsm_t* r_cntl + ) { trsm_t* cntl; obj_t a_local; diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index 6ee063797..c80156d72 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -32,11 +32,13 @@ */ -void bli_trsm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - trsm_t* l_cntl, - trsm_t* r_cntl ); - +void bli_trsm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + trsm_t* l_cntl, + trsm_t* r_cntl + ); diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index 123ef6585..a517a6cc3 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -34,73 +34,38 @@ 
#include "blis.h" -#define FUNCPTR_T trsm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[2][2][4][3] = +#if 1 +static trsm_voft vars[4][3] = { - // left - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1f }, - { NULL, bli_trsm_ll_ker_var2, bli_trsm_blk_var2f }, - { NULL, NULL, bli_trsm_blk_var3f }, - { NULL, NULL, NULL, }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1b }, - { NULL, bli_trsm_lu_ker_var2, bli_trsm_blk_var2b }, - { NULL, NULL, bli_trsm_blk_var3b }, - { NULL, NULL, NULL, }, - } - }, - // right - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1b }, - { NULL, bli_trsm_rl_ker_var2, bli_trsm_blk_var2b }, - { NULL, NULL, bli_trsm_blk_var3b }, - { NULL, NULL, NULL, }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1f }, - { NULL, bli_trsm_ru_ker_var2, bli_trsm_blk_var2f }, - { NULL, NULL, bli_trsm_blk_var3f }, - { NULL, NULL, NULL, }, - } - } + // unblocked optimized unblocked blocked + { NULL, NULL, bli_trsm_blk_var1 }, + { NULL, bli_trsm_xx_ker_var2, bli_trsm_blk_var2 }, + { NULL, NULL, bli_trsm_blk_var3 }, + { NULL, NULL, NULL }, }; +#endif -void bli_trsm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t b_local; obj_t c_local; +#if 0 bool_t side, uplo; +#endif varnum_t n; impl_t i; - FUNCPTR_T f; + trsm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -148,9 +113,11 @@ void bli_trsm_int( obj_t* alpha, // matrix's root object (whether that is matrix A or matrix B). 
if ( bli_obj_root_is_triangular( *a ) ) { +#if 0 side = 0; if ( bli_obj_root_is_lower( *a ) ) uplo = 0; else uplo = 1; +#endif // If alpha is non-unit, typecast and apply it to the scalar // attached to B (the non-triangular matrix). @@ -161,10 +128,12 @@ void bli_trsm_int( obj_t* alpha, } else // if ( bli_obj_root_is_triangular( *b ) ) { +#if 0 side = 1; // Set a bool based on the uplo field of A's root object. if ( bli_obj_root_is_lower( *b ) ) uplo = 0; else uplo = 1; +#endif // If alpha is non-unit, typecast and apply it to the scalar // attached to A (the non-triangular matrix). @@ -181,14 +150,20 @@ void bli_trsm_int( obj_t* alpha, i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. - f = vars[side][uplo][n][i]; + f = vars[n][i]; + + // Extract the function pointer from the current control tree node. + //f = bli_cntl_sub_prob( cntl ); // Invoke the variant. - f( &a_local, - &b_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &b_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/trsm/bli_trsm_int.h index deecc6565..a379ea002 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/trsm/bli_trsm_int.h @@ -32,11 +32,15 @@ */ -void bli_trsm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ); +void bli_trsm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ); + diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index db4668d1f..07ba4361f 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); -void bli_trsm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void 
bli_trsm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 17041d986..ba34f1c3a 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); -void bli_trsm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 40f3d5511..7a25b1ce5 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); -void bli_trsm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 6482fa777..d610925f3 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); -void bli_trsm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/trsm/bli_trsm_var.h 
b/frame/3/trsm/bli_trsm_var.h index 2a2c0efc8..9c526d820 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -50,13 +50,11 @@ void PASTEMAC0(opname) \ thrinfo_t* thread \ ); -GENPROT( trsm_blk_var1f ) -GENPROT( trsm_blk_var1b ) -GENPROT( trsm_blk_var2f ) -GENPROT( trsm_blk_var2b ) -GENPROT( trsm_blk_var3f ) -GENPROT( trsm_blk_var3b ) +GENPROT( trsm_blk_var1 ) +GENPROT( trsm_blk_var2 ) +GENPROT( trsm_blk_var3 ) +GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c new file mode 100644 index 000000000..ad1238319 --- /dev/null +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -0,0 +1,87 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +static trsm_voft vars[2][2] = +{ + { bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 }, + { bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 } +}; + +void bli_trsm_xx_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + trsm_t* cntl, + thrinfo_t* thread + ) +{ + bool_t side; + bool_t uplo; + trsm_voft f; + + // Set two bools: one based on the implied side parameter (the structure + // of the root object) and one based on the uplo field of the triangular + // matrix's root object (whether that is matrix A or matrix B). + if ( bli_obj_root_is_triangular( *a ) ) + { + side = 0; + if ( bli_obj_root_is_lower( *a ) ) uplo = 0; + else uplo = 1; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + side = 1; + if ( bli_obj_root_is_lower( *b ) ) uplo = 0; + else uplo = 1; + } + + // Index into the variant array to extract the correct function pointer. + f = vars[side][uplo]; + + // Call the macrokernel. 
+ f + ( + a, + b, + c, + cntx, + cntl, + thread + ); +} + diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/old/bli_trsm_blk_var1b.c similarity index 100% rename from frame/3/trsm/bli_trsm_blk_var1b.c rename to frame/3/trsm/old/bli_trsm_blk_var1b.c diff --git a/frame/3/trsm/old/bli_trsm_blk_var1b.h b/frame/3/trsm/old/bli_trsm_blk_var1b.h deleted file mode 100644 index 77601bb76..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var1b.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var1b( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/old/bli_trsm_blk_var2b.c similarity index 100% rename from frame/3/trsm/bli_trsm_blk_var2b.c rename to frame/3/trsm/old/bli_trsm_blk_var2b.c diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/old/bli_trsm_blk_var3b.c similarity index 100% rename from frame/3/trsm/bli_trsm_blk_var3b.c rename to frame/3/trsm/old/bli_trsm_blk_var3b.c diff --git a/frame/3/trsm/old/bli_trsm_blk_var3b.h b/frame/3/trsm/old/bli_trsm_blk_var3b.h deleted file mode 100644 index 5cab7bdcf..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var3b.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var3b( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/old/bli_trsm_blk_var3f.h b/frame/3/trsm/old/bli_trsm_blk_var3f.h deleted file mode 100644 index 2c6fbb214..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var3f.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/old/bli_trsm_ll_ker_var2.h b/frame/3/trsm/old/bli_trsm_ll_ker_var2.h deleted file mode 100644 index 09812df14..000000000 --- a/frame/3/trsm/old/bli_trsm_ll_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_ll_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_lu_ker_var2.h b/frame/3/trsm/old/bli_trsm_lu_ker_var2.h deleted file mode 100644 index aa7c8ed47..000000000 --- a/frame/3/trsm/old/bli_trsm_lu_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_lu_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_rl_ker_var2.h b/frame/3/trsm/old/bli_trsm_rl_ker_var2.h deleted file mode 100644 index 0fd7e6bbe..000000000 --- a/frame/3/trsm/old/bli_trsm_rl_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_rl_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_ru_ker_var2.h b/frame/3/trsm/old/bli_trsm_ru_ker_var2.h deleted file mode 100644 index a30e20070..000000000 --- a/frame/3/trsm/old/bli_trsm_ru_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_ru_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_thread.c b/frame/3/trsm/old/bli_trsm_thread.c deleted file mode 100644 index 1a9f4ec16..000000000 --- a/frame/3/trsm/old/bli_trsm_thread.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include "assert.h" - -#if 0 -thrinfo_t** bli_trsm_thrinfo_create_paths( bool_t right_sided ) -{ - -#ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" ); -// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - dim_t kc_way = 1; - dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" ); - dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" ); - dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" ); - - if ( right_sided ) - { - ic_way = jc_way * ic_way * jr_way; - - jc_way = 1; - kc_way = 1; - jr_way = 1; - ir_way = 1; - } - else - { - jr_way = ic_way * jr_way * ir_way; - - jc_way = 1; - kc_way = 1; - ic_way = 1; - ir_way = 1; - } -#else - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t jr_way = 1; - dim_t ir_way = 1; -#endif - - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; - dim_t ic_nt = jr_way * ir_way; - dim_t jr_nt = ir_way; - dim_t ir_nt = 1; - - - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); - - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); - for( int a = 0; a < jc_way; a++ ) - { - thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) - { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); - for( int c = 0; c < ic_way; c++ ) - { - thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt ); - for( int d = 0; d < jr_way; d++ ) - { - thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt ); - for( int e = 0; e < ir_way; e++ ) - { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - - // Macrokernel loops - thrinfo_t* ir_info = 
bli_l3_thrinfo_create_node( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL); - - thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info); - //blk_var_1 - packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info); - //blk_var_3 - packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info); - //blk_var_2 - packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info); - - paths[global_comm_id] = jc_info; - } - } - } - } - } - return paths; -} -#endif diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index e7bd0be2a..833dadb42 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -35,10 +35,13 @@ #include "blis.h" -blksz_t* bli_blksz_obj_create( dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ) +blksz_t* bli_blksz_obj_create + ( + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t 
be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ) { blksz_t* b; @@ -53,11 +56,14 @@ blksz_t* bli_blksz_obj_create( dim_t b_s, dim_t be_s, return b; } -void bli_blksz_obj_init( blksz_t* b, - dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ) +void bli_blksz_obj_init + ( + blksz_t* b, + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ) { b->v[BLIS_FLOAT] = b_s; b->v[BLIS_DOUBLE] = b_d; @@ -69,15 +75,21 @@ void bli_blksz_obj_init( blksz_t* b, b->e[BLIS_DCOMPLEX] = be_z; } -void bli_blksz_obj_free( blksz_t* b ) +void bli_blksz_obj_free + ( + blksz_t* b + ) { bli_free_intl( b ); } // ----------------------------------------------------------------------------- -void bli_blksz_reduce_dt_to( num_t dt_bm, blksz_t* bmult, - num_t dt_bs, blksz_t* blksz ) +void bli_blksz_reduce_dt_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ) { dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz ); dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz ); @@ -107,11 +119,30 @@ void bli_blksz_reduce_dt_to( num_t dt_bm, blksz_t* bmult, // ----------------------------------------------------------------------------- -dim_t bli_determine_blocksize_f( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ) +dim_t bli_determine_blocksize + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) +{ + if ( direct == BLIS_FWD ) + return bli_determine_blocksize_f( i, dim, obj, bszid, cntx ); + else + return bli_determine_blocksize_b( i, dim, obj, bszid, cntx ); +} + +dim_t bli_determine_blocksize_f + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) { num_t dt; blksz_t* bsize; @@ -130,10 +161,39 @@ dim_t bli_determine_blocksize_f( dim_t i, return b_use; } -dim_t bli_determine_blocksize_f_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ) +dim_t bli_determine_blocksize_b + ( + dim_t i, + dim_t 
dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) +{ + num_t dt; + blksz_t* bsize; + dim_t b_alg, b_max; + dim_t b_use; + + // Extract the execution datatype and use it to query the corresponding + // blocksize and blocksize maximum values from the blksz_t object. + dt = bli_obj_execution_datatype( *obj ); + bsize = bli_cntx_get_blksz( bszid, cntx ); + b_alg = bli_blksz_get_def( dt, bsize ); + b_max = bli_blksz_get_max( dt, bsize ); + + b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); + + return b_use; +} + +dim_t bli_determine_blocksize_f_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ) { dim_t b_now; dim_t dim_left_now; @@ -161,33 +221,13 @@ dim_t bli_determine_blocksize_f_sub( dim_t i, return b_now; } -dim_t bli_determine_blocksize_b( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ) -{ - num_t dt; - blksz_t* bsize; - dim_t b_alg, b_max; - dim_t b_use; - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. 
- dt = bli_obj_execution_datatype( *obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_determine_blocksize_b_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ) +dim_t bli_determine_blocksize_b_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ) { dim_t b_now; dim_t dim_left_now; diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index 11a8cb650..daffb3772 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -97,43 +97,79 @@ // ----------------------------------------------------------------------------- -blksz_t* bli_blksz_obj_create( dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ); +blksz_t* bli_blksz_obj_create + ( + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ); -void bli_blksz_obj_init( blksz_t* b, - dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ); +void bli_blksz_obj_init + ( + blksz_t* b, + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ); -void bli_blksz_obj_free( blksz_t* b ); +void bli_blksz_obj_free + ( + blksz_t* b + ); // ----------------------------------------------------------------------------- -void bli_blksz_reduce_dt_to( num_t dt_bm, blksz_t* bmult, - num_t dt_bs, blksz_t* blksz ); +void bli_blksz_reduce_dt_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ); // ----------------------------------------------------------------------------- -dim_t bli_determine_blocksize_f( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ); -dim_t bli_determine_blocksize_f_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ); +dim_t bli_determine_blocksize + ( + dir_t direct, + dim_t i, + 
dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); -dim_t bli_determine_blocksize_b( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ); -dim_t bli_determine_blocksize_b_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ); +dim_t bli_determine_blocksize_f + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); + +dim_t bli_determine_blocksize_b + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); + +dim_t bli_determine_blocksize_f_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ); + +dim_t bli_determine_blocksize_b_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ); diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index 64718353e..8951a1d62 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -38,11 +38,31 @@ // -- Matrix partitioning ------------------------------------------------------ -void bli_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_mdim + ( + dir_t direct, + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) +{ + if ( direct == BLIS_FWD ) + bli_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); + else + bli_acquire_mpart_b2t( req_part, i, b, obj, sub_obj ); +} + + +void bli_acquire_mpart_t2b + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; dim_t n; @@ -59,14 +79,14 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, // partitioned through normally.) if ( bli_obj_is_panel_packed( *obj ) ) { - bli_packm_acquire_mpart_t2b( requested_part, i, b, obj, sub_obj ); + bli_packm_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); return; } // Check parameters. 
if ( bli_error_checking_is_enabled() ) - bli_acquire_mpart_t2b_check( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_t2b_check( req_part, i, b, obj, sub_obj ); // Query the m and n dimensions of the object (accounting for @@ -90,7 +110,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, // Compute offset increments and dimensions based on which // subpartition is being requested, assuming no transposition. - if ( requested_part == BLIS_SUBPART0 ) + if ( req_part == BLIS_SUBPART0 ) { // A0 (offm,offn) unchanged. // A0 is i x n. @@ -99,7 +119,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = i; n_part = n; } - else if ( requested_part == BLIS_SUBPART1T ) + else if ( req_part == BLIS_SUBPART1T ) { // A1T (offm,offn) unchanged. // A1T is (i+b) x n. @@ -108,7 +128,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = i + b; n_part = n; } - else if ( requested_part == BLIS_SUBPART1 ) + else if ( req_part == BLIS_SUBPART1 ) { // A1 (offm,offn) += (i,0). // A1 is b x n. @@ -117,7 +137,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = b; n_part = n; } - else if ( requested_part == BLIS_SUBPART1B ) + else if ( req_part == BLIS_SUBPART1B ) { // A1B (offm,offn) += (i,0). // A1B is (m-i) x n. @@ -126,7 +146,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = m - i; n_part = n; } - else // if ( requested_part == BLIS_SUBPART2 ) + else // if ( req_part == BLIS_SUBPART2 ) { // A2 (offm,offn) += (i+b,0). // A2 is (m-i-b) x n. @@ -208,11 +228,14 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, } -void bli_acquire_mpart_b2t( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_b2t + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; @@ -222,15 +245,35 @@ void bli_acquire_mpart_b2t( subpart_t requested_part, // Modify i to account for the fact that we are moving backwards. 
i = m - i - b; - bli_acquire_mpart_t2b( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); } -void bli_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_ndim + ( + dir_t direct, + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) +{ + if ( direct == BLIS_FWD ) + bli_acquire_mpart_l2r( req_part, i, b, obj, sub_obj ); + else + bli_acquire_mpart_r2l( req_part, i, b, obj, sub_obj ); +} + + +void bli_acquire_mpart_l2r + ( + subpart_t req_part, + dim_t j, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; dim_t n; @@ -247,14 +290,14 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, // partitioned through normally.) if ( bli_obj_is_panel_packed( *obj ) ) { - bli_packm_acquire_mpart_l2r( requested_part, j, b, obj, sub_obj ); + bli_packm_acquire_mpart_l2r( req_part, j, b, obj, sub_obj ); return; } // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_acquire_mpart_l2r_check( requested_part, j, b, obj, sub_obj ); + bli_acquire_mpart_l2r_check( req_part, j, b, obj, sub_obj ); // Query the m and n dimensions of the object (accounting for @@ -278,7 +321,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, // Compute offset increments and dimensions based on which // subpartition is being requested, assuming no transposition. - if ( requested_part == BLIS_SUBPART0 ) + if ( req_part == BLIS_SUBPART0 ) { // A0 (offm,offn) unchanged. // A0 is m x j. @@ -287,7 +330,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = j; } - else if ( requested_part == BLIS_SUBPART1L ) + else if ( req_part == BLIS_SUBPART1L ) { // A1L (offm,offn) unchanged. // A1L is m x (j+b). @@ -296,7 +339,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = j + b; } - else if ( requested_part == BLIS_SUBPART1 ) + else if ( req_part == BLIS_SUBPART1 ) { // A1 (offm,offn) += (0,j). 
// A1 is m x b. @@ -305,7 +348,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = b; } - else if ( requested_part == BLIS_SUBPART1R ) + else if ( req_part == BLIS_SUBPART1R ) { // A1R (offm,offn) += (0,j). // A1R is m x (n-j). @@ -314,7 +357,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = n - j; } - else // if ( requested_part == BLIS_SUBPART2 ) + else // if ( req_part == BLIS_SUBPART2 ) { // A2 (offm,offn) += (0,j+b). // A2 is m x (n-j-b). @@ -395,11 +438,14 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, } -void bli_acquire_mpart_r2l( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_r2l + ( + subpart_t req_part, + dim_t j, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t n; @@ -409,15 +455,18 @@ void bli_acquire_mpart_r2l( subpart_t requested_part, // Modify i to account for the fact that we are moving backwards. j = n - j - b; - bli_acquire_mpart_l2r( requested_part, j, b, obj, sub_obj ); + bli_acquire_mpart_l2r( req_part, j, b, obj, sub_obj ); } -void bli_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_tl2br + ( + subpart_t req_part, + dim_t ij, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; dim_t n; @@ -435,14 +484,14 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, // partitioned through normally.) if ( bli_obj_is_panel_packed( *obj ) ) { - bli_packm_acquire_mpart_tl2br( requested_part, ij, b, obj, sub_obj ); + bli_packm_acquire_mpart_tl2br( req_part, ij, b, obj, sub_obj ); return; } // Check parameters. 
if ( bli_error_checking_is_enabled() ) - bli_acquire_mpart_tl2br_check( requested_part, ij, b, obj, sub_obj ); + bli_acquire_mpart_tl2br_check( req_part, ij, b, obj, sub_obj ); // Query the m and n dimensions of the object (accounting for @@ -469,7 +518,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, // subpartition is being requested, assuming no transposition. // Left column of subpartitions - if ( requested_part == BLIS_SUBPART00 ) + if ( req_part == BLIS_SUBPART00 ) { // A00 (offm,offn) unchanged. // A00 is ij x ij. @@ -478,7 +527,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = ij; n_part = ij; } - else if ( requested_part == BLIS_SUBPART10 ) + else if ( req_part == BLIS_SUBPART10 ) { // A10 (offm,offn) += (ij,0). // A10 is b x ij. @@ -487,7 +536,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = b; n_part = ij; } - else if ( requested_part == BLIS_SUBPART20 ) + else if ( req_part == BLIS_SUBPART20 ) { // A20 (offm,offn) += (ij+b,0). // A20 is (m-ij-b) x ij. @@ -498,7 +547,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, } // Middle column of subpartitions. - else if ( requested_part == BLIS_SUBPART01 ) + else if ( req_part == BLIS_SUBPART01 ) { // A01 (offm,offn) += (0,ij). // A01 is ij x b. @@ -507,7 +556,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = ij; n_part = b; } - else if ( requested_part == BLIS_SUBPART11 ) + else if ( req_part == BLIS_SUBPART11 ) { // A11 (offm,offn) += (ij,ij). // A11 is b x b. @@ -516,7 +565,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = b; n_part = b; } - else if ( requested_part == BLIS_SUBPART21 ) + else if ( req_part == BLIS_SUBPART21 ) { // A21 (offm,offn) += (ij+b,ij). // A21 is (m-ij-b) x b. @@ -527,7 +576,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, } // Right column of subpartitions. 
- else if ( requested_part == BLIS_SUBPART02 ) + else if ( req_part == BLIS_SUBPART02 ) { // A02 (offm,offn) += (0,ij+b). // A02 is ij x (n-ij-b). @@ -536,7 +585,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = ij; n_part = n - ij - b; } - else if ( requested_part == BLIS_SUBPART12 ) + else if ( req_part == BLIS_SUBPART12 ) { // A12 (offm,offn) += (ij,ij+b). // A12 is b x (n-ij-b). @@ -545,7 +594,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = b; n_part = n - ij - b; } - else // if ( requested_part == BLIS_SUBPART22 ) + else // if ( req_part == BLIS_SUBPART22 ) { // A22 (offm,offn) += (ij+b,ij+b). // A22 is (m-ij-b) x (n-ij-b). @@ -588,9 +637,9 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, // we let the subpartition inherit the storage structure of its immediate // parent. if ( !bli_obj_root_is_general( *sub_obj ) && - requested_part != BLIS_SUBPART00 && - requested_part != BLIS_SUBPART11 && - requested_part != BLIS_SUBPART22 ) + req_part != BLIS_SUBPART00 && + req_part != BLIS_SUBPART11 && + req_part != BLIS_SUBPART22 ) { // FGVZ: Fix me. This needs to be cleaned up. Either non-diagonal // intersecting subpartitions should inherit their root object's @@ -638,11 +687,14 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, } -void bli_acquire_mpart_br2tl( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_br2tl + ( + subpart_t req_part, + dim_t ij, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { // Query the dimension of the object. dim_t mn = bli_obj_length( *obj ); @@ -650,35 +702,41 @@ void bli_acquire_mpart_br2tl( subpart_t requested_part, // Modify ij to account for the fact that we are moving backwards. 
ij = mn - ij - b; - bli_acquire_mpart_tl2br( requested_part, ij, b, obj, sub_obj ); + bli_acquire_mpart_tl2br( req_part, ij, b, obj, sub_obj ); } // -- Vector partitioning ------------------------------------------------------ -void bli_acquire_vpart_f2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_vpart_f2b + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { if ( bli_obj_is_col_vector( *obj ) ) - bli_acquire_mpart_t2b( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); else // if ( bli_obj_is_row_vector( *obj ) ) - bli_acquire_mpart_l2r( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_l2r( req_part, i, b, obj, sub_obj ); } -void bli_acquire_vpart_b2f( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_vpart_b2f + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { if ( bli_obj_is_col_vector( *obj ) ) - bli_acquire_mpart_b2t( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_b2t( req_part, i, b, obj, sub_obj ); else // if ( bli_obj_is_row_vector( *obj ) ) - bli_acquire_mpart_r2l( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_r2l( req_part, i, b, obj, sub_obj ); } diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h index ed1fa0d15..0d3d021b4 100644 --- a/frame/base/bli_part.h +++ b/frame/base/bli_part.h @@ -36,50 +36,43 @@ // -- Matrix partitioning ------------------------------------------------------ -void bli_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); -void bli_acquire_mpart_b2t( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC0( opname ) \ + ( \ + dir_t direct, \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + obj_t* obj, \ + obj_t* sub_obj \ + ); -void 
bli_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); -void bli_acquire_mpart_r2l( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +GENPROT( acquire_mpart_mdim ) +GENPROT( acquire_mpart_ndim ) -void bli_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); -void bli_acquire_mpart_br2tl( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC0( opname ) \ + ( \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + obj_t* obj, \ + obj_t* sub_obj \ + ); + +GENPROT( acquire_mpart_t2b ) +GENPROT( acquire_mpart_b2t ) +GENPROT( acquire_mpart_l2r ) +GENPROT( acquire_mpart_r2l ) +GENPROT( acquire_mpart_tl2br ) +GENPROT( acquire_mpart_br2tl ) // -- Vector partitioning ------------------------------------------------------ -void bli_acquire_vpart_f2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_acquire_vpart_b2f( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +GENPROT( acquire_vpart_f2b ) +GENPROT( acquire_vpart_b2f ) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 7274ce5a6..8168a1a7c 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -501,6 +501,15 @@ typedef enum } packbuf_t; +// -- Partitioning direction -- + +typedef enum +{ + BLIS_FWD, + BLIS_BWD +} dir_t; + + // // -- BLIS misc. 
structure types ----------------------------------------------- // diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index a4f69aeba..1cbd6eefe 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -70,7 +70,7 @@ bool_t bli_thread_is_initialized( void ) // ----------------------------------------------------------------------------- -void bli_thread_get_range +void bli_thread_get_range_sub ( thrinfo_t* thread, dim_t n, @@ -211,6 +211,38 @@ void bli_thread_get_range } } +siz_t bli_thread_get_range_mdim + ( + dir_t direct, + thrinfo_t* thr, + obj_t* a, + blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + if ( direct == BLIS_FWD ) + return bli_thread_get_range_t2b( thr, a, bmult, start, end ); + else + return bli_thread_get_range_b2t( thr, a, bmult, start, end ); +} + +siz_t bli_thread_get_range_ndim + ( + dir_t direct, + thrinfo_t* thr, + obj_t* a, + blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + if ( direct == BLIS_FWD ) + return bli_thread_get_range_l2r( thr, a, bmult, start, end ); + else + return bli_thread_get_range_r2l( thr, a, bmult, start, end ); +} + siz_t bli_thread_get_range_l2r ( thrinfo_t* thr, @@ -224,8 +256,8 @@ siz_t bli_thread_get_range_l2r dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, n, bf, - FALSE, start, end ); + bli_thread_get_range_sub( thr, n, bf, + FALSE, start, end ); return m * ( *end - *start ); } @@ -243,8 +275,8 @@ siz_t bli_thread_get_range_r2l dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, n, bf, - TRUE, start, end ); + bli_thread_get_range_sub( thr, n, bf, + TRUE, start, end ); return m * ( *end - *start ); } @@ -262,8 +294,8 @@ siz_t bli_thread_get_range_t2b dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, m, bf, - FALSE, start, end ); + bli_thread_get_range_sub( thr, 
m, bf, + FALSE, start, end ); return n * ( *end - *start ); } @@ -281,12 +313,14 @@ siz_t bli_thread_get_range_b2t dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, m, bf, - TRUE, start, end ); + bli_thread_get_range_sub( thr, m, bf, + TRUE, start, end ); return n * ( *end - *start ); } +// ----------------------------------------------------------------------------- + dim_t bli_thread_get_range_width_l ( doff_t diagoff_j, @@ -496,7 +530,9 @@ siz_t bli_find_area_trap_l return ( siz_t )area; } -siz_t bli_thread_get_range_weighted +// ----------------------------------------------------------------------------- + +siz_t bli_thread_get_range_weighted_sub ( thrinfo_t* thread, doff_t diagoff, @@ -570,11 +606,15 @@ siz_t bli_thread_get_range_weighted { // Compute the width of the jth subpartition, taking the // current diagonal offset into account, if needed. - width_j = bli_thread_get_range_width_l( diagoff_j, m, n_left, - j, n_way, - bf, bf_left, - area_per_thr, - handle_edge_low ); + width_j = + bli_thread_get_range_width_l + ( + diagoff_j, m, n_left, + j, n_way, + bf, bf_left, + area_per_thr, + handle_edge_low + ); // If the current thread belongs to caucus j, this is his // subpartition. So we compute the implied index range and @@ -611,9 +651,12 @@ siz_t bli_thread_get_range_weighted bli_toggle_bool( handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. 
- area = bli_thread_get_range_weighted( thread, diagoff, uplo, m, n, bf, - handle_edge_low, - j_start_thr, j_end_thr ); + area = bli_thread_get_range_weighted_sub + ( + thread, diagoff, uplo, m, n, bf, + handle_edge_low, + j_start_thr, j_end_thr + ); // Reverse the indexing basis for the subpartition ranges so that // the indices, relative to left-to-right iteration through the @@ -626,6 +669,38 @@ siz_t bli_thread_get_range_weighted return area; } +siz_t bli_thread_get_range_weighted_mdim + ( + dir_t direct, + thrinfo_t* thr, + obj_t* a, + blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + if ( direct == BLIS_FWD ) + return bli_thread_get_range_t2b( thr, a, bmult, start, end ); + else + return bli_thread_get_range_b2t( thr, a, bmult, start, end ); +} + +siz_t bli_thread_get_range_weighted_ndim + ( + dir_t direct, + thrinfo_t* thr, + obj_t* a, + blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + if ( direct == BLIS_FWD ) + return bli_thread_get_range_l2r( thr, a, bmult, start, end ); + else + return bli_thread_get_range_r2l( thr, a, bmult, start, end ); +} + siz_t bli_thread_get_range_weighted_l2r ( thrinfo_t* thr, @@ -656,13 +731,20 @@ siz_t bli_thread_get_range_weighted_l2r bli_reflect_about_diag( diagoff, uplo, m, n ); } - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - FALSE, start, end ); + area = + bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + FALSE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_l2r( thr, a, bmult, - start, end ); + area = bli_thread_get_range_l2r + ( + thr, a, bmult, + start, end + ); } return area; @@ -700,13 +782,20 @@ siz_t bli_thread_get_range_weighted_r2l bli_rotate180_trapezoid( diagoff, uplo ); - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - TRUE, start, end ); + area = + bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + TRUE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_r2l( 
thr, a, bmult, - start, end ); + area = bli_thread_get_range_r2l + ( + thr, a, bmult, + start, end + ); } return area; @@ -744,13 +833,20 @@ siz_t bli_thread_get_range_weighted_t2b bli_reflect_about_diag( diagoff, uplo, m, n ); - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - FALSE, start, end ); + area = + bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + FALSE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_t2b( thr, a, bmult, - start, end ); + area = bli_thread_get_range_t2b + ( + thr, a, bmult, + start, end + ); } return area; @@ -790,18 +886,25 @@ siz_t bli_thread_get_range_weighted_b2t bli_rotate180_trapezoid( diagoff, uplo ); - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - TRUE, start, end ); + area = bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + TRUE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_b2t( thr, a, bmult, - start, end ); + area = bli_thread_get_range_b2t + ( + thr, a, bmult, + start, end + ); } return area; } +// ----------------------------------------------------------------------------- // Some utilities dim_t bli_env_read_nway( char* env ) diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 6ef2ebb1a..478d12aa6 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -68,7 +68,7 @@ void bli_thread_finalize( void ); bool_t bli_thread_is_initialized( void ); // Thread range-related prototypes. 
-void bli_thread_get_range +void bli_thread_get_range_sub ( thrinfo_t* thread, dim_t n, @@ -78,6 +78,24 @@ void bli_thread_get_range dim_t* end ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +siz_t PASTEMAC0( opname ) \ + ( \ + dir_t direct, \ + thrinfo_t* thr, \ + obj_t* a, \ + blksz_t* bmult, \ + dim_t* start, \ + dim_t* end \ + ); + +GENPROT( thread_get_range_mdim ) +GENPROT( thread_get_range_ndim ) +GENPROT( thread_get_range_weighted_mdim ) +GENPROT( thread_get_range_weighted_ndim ) + #undef GENPROT #define GENPROT( opname ) \ \ @@ -119,7 +137,7 @@ siz_t bli_find_area_trap_l dim_t n, doff_t diagoff ); -siz_t bli_thread_get_range_weighted +siz_t bli_thread_get_range_weighted_sub ( thrinfo_t* thread, doff_t diagoff, From a017062fdf763037da9d971a028bb07d47aa1c8a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 22 Jul 2016 17:02:59 -0500 Subject: [PATCH 02/27] Integrated "memory broker" (membrk_t) abstraction. Details: - Integrated a patch originally authored and submitted by Ricardo Magana of HP Enterprise. The changeset inserts use of a new object type, membrk_t, (memory broker) that allows multiple sets of memory pools on, for example, separate NUMA nodes, each of which has a separate memory space. - Added membrk field to cntx_t and defined corresponding accessor macros. - Added membrk field to mem_t object and defined corresponding accessor macros. 
- Created new bli_membrk.c file, which contains the new memory broker API, including: bli_membrk_init(), bli_membrk_finalize() bli_membrk_acquire_[mv](), bli_membrk_release(), bli_membrk_init_pools(), bli_membrk_reinit_pools(), bli_membrk_finalize_pools(), bli_membrk_pool_size() - In bli_mem.c, changed function calls to bli_mem_init_pools() -> bli_membrk_init() bli_mem_reinit_pools() -> bli_membrk_reinit() bli_mem_finalize_pools() -> bli_membrk_finalize() - In bli_packv_init.c, bli_packm_init.c, changed function calls to: bli_mem_acquire_[mv]() -> bli_membrk_acquire_[mv]() bli_mem_release() -> bli_membrk_release() - Added bli_mutex.c and related files to frame/thread. These files define abstract mutexes (locks) and corresponding APIs for pthreads, openmp, or single-threaded execution. This new API is employed within functions such as bli_membrk_acquire_[mv]() and bli_membrk_release(). --- frame/1/packv/bli_packv_init.c | 34 +- frame/1m/packm/bli_packm_cntx.c | 4 + frame/1m/packm/bli_packm_init.c | 53 +-- frame/base/bli_cntx.h | 40 +- frame/base/bli_mem.c | 495 +----------------------- frame/base/bli_mem.h | 39 +- frame/base/bli_membrk.c | 578 +++++++++++++++++++++++++++++ frame/base/bli_membrk.h | 169 +++++++++ frame/include/bli_mem_macro_defs.h | 15 +- frame/include/bli_obj_macro_defs.h | 5 +- frame/include/bli_type_defs.h | 21 +- frame/include/blis.h | 2 + frame/thread/bli_mutex.h | 49 +++ frame/thread/bli_mutex_openmp.h | 72 ++++ frame/thread/bli_mutex_pthreads.h | 72 ++++ frame/thread/bli_mutex_single.h | 65 ++++ frame/thread/bli_thread.h | 4 + 17 files changed, 1156 insertions(+), 561 deletions(-) create mode 100644 frame/base/bli_membrk.c create mode 100644 frame/base/bli_membrk.h create mode 100644 frame/thread/bli_mutex.h create mode 100644 frame/thread/bli_mutex_openmp.h create mode 100644 frame/thread/bli_mutex_pthreads.h create mode 100644 frame/thread/bli_mutex_single.h diff --git a/frame/1/packv/bli_packv_init.c b/frame/1/packv/bli_packv_init.c index 
5d8a10b98..c43931272 100644 --- a/frame/1/packv/bli_packv_init.c +++ b/frame/1/packv/bli_packv_init.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -43,6 +44,7 @@ void bli_packv_init ) { // The purpose of packm_init() is to initialize an object P so that + // a source object A can be packed into P via one of the packv // implementations. This initialization includes acquiring a suitable // block of memory from the memory allocator, if such a block of memory @@ -132,15 +134,17 @@ void bli_packv_init_pack cntx_t* cntx ) { - num_t dt = bli_obj_datatype( *c ); - dim_t dim_c = bli_obj_vector_dim( *c ); - dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); + num_t dt = bli_obj_datatype( *c ); + dim_t dim_c = bli_obj_vector_dim( *c ); + dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); - mem_t* mem_p; - dim_t m_p_pad; - siz_t size_p; - inc_t rs_p, cs_p; - void* buf; + membrk_t* membrk = bli_cntx_membrk( cntx ); + + mem_t* mem_p; + dim_t m_p_pad; + siz_t size_p; + inc_t rs_p, cs_p; + void* buf; // We begin by copying the basic fields of c. @@ -170,8 +174,9 @@ void bli_packv_init_pack { // If the mem_t object of p has not yet been allocated, then acquire // a memory block suitable for a vector. - bli_mem_acquire_v( size_p, - mem_p ); + bli_membrk_acquire_v( membrk, + size_p, + mem_p ); } else { @@ -179,10 +184,11 @@ void bli_packv_init_pack // re-acquire the memory so there is sufficient space. 
if ( bli_mem_size( mem_p ) < size_p ) { - bli_mem_release( mem_p ); + bli_membrk_release( mem_p ); - bli_mem_acquire_v( size_p, - mem_p ); + bli_membrk_acquire_v( membrk, + size_p, + mem_p ); } } @@ -218,7 +224,7 @@ void bli_packv_release ) { if ( !bli_cntl_is_noop( cntl ) ) - bli_obj_release_pack( p ); + bli_obj_release_pack( p ); } diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index 787531f41..d42abfd62 100644 --- a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -49,6 +50,9 @@ void bli_packm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx ); bli_gks_cntx_set_l1v_ker( BLIS_SCAL2V_KER, cntx ); bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx ); + + // Initialize the context with the global membrk object. + bli_cntx_set_membrk( bli_mem_global_membrk(), cntx ); } void bli_packm_cntx_finalize( cntx_t* cntx ) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index cb6f28fe2..c33a0410e 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -202,23 +203,25 @@ void bli_packm_init_pack( invdiag_t invert_diag, obj_t* p, cntx_t* cntx ) { - num_t dt = bli_obj_datatype( *c ); - trans_t transc = bli_obj_onlytrans_status( *c ); - dim_t m_c = bli_obj_length( *c ); - dim_t n_c = bli_obj_width( *c ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ); - dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx ); + num_t dt = bli_obj_datatype( *c ); + trans_t transc = bli_obj_onlytrans_status( *c ); + dim_t m_c = bli_obj_length( *c ); + dim_t n_c = bli_obj_width( *c ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ); + dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx ); - mem_t* mem_p; - dim_t m_p, n_p; - dim_t m_p_pad, n_p_pad; - siz_t size_p; - siz_t elem_size_p; - inc_t rs_p, cs_p; - inc_t is_p; - void* buf; + membrk_t* membrk = bli_cntx_get_membrk( cntx ); + + mem_t* mem_p; + dim_t m_p, n_p; + dim_t m_p_pad, n_p_pad; + siz_t size_p; + siz_t elem_size_p; + inc_t rs_p, cs_p; + inc_t is_p; + void* buf; // We begin by copying the basic fields of c. We do NOT copy the @@ -549,9 +552,10 @@ void bli_packm_init_pack( invdiag_t invert_diag, { // If the mem_t object of p has not yet been allocated, then acquire // a memory block of type pack_buf_type. 
- bli_mem_acquire_m( size_p, - pack_buf_type, - mem_p ); + bli_membrk_acquire_m( membrk, + size_p, + pack_buf_type, + mem_p ); } else { @@ -562,10 +566,11 @@ void bli_packm_init_pack( invdiag_t invert_diag, // pack_buf_type value. if ( bli_mem_size( mem_p ) < size_p ) { - bli_mem_release( mem_p ); - bli_mem_acquire_m( size_p, - pack_buf_type, - mem_p ); + bli_membrk_release( mem_p ); + bli_membrk_acquire_m( membrk, + size_p, + pack_buf_type, + mem_p ); } } @@ -582,7 +587,7 @@ void bli_packm_release( obj_t* p, packm_t* cntl ) { if ( !bli_cntl_is_noop( cntl ) ) - bli_obj_release_pack( p ); + bli_obj_release_pack( p ); } diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 5635ddc88..337d233b3 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -57,6 +58,7 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + membrk_t* membrk; } cntx_t; */ @@ -116,66 +118,75 @@ typedef struct cntx_s \ ( (cntx)->schema_c ) +#define bli_cntx_membrk( cntx ) \ +\ + ( (cntx)->membrk ) + // cntx_t modification (fields only) #define bli_cntx_set_blkszs_buf( _blkszs, cntx_p ) \ { \ - (cntx_p)->blkszs = _blkszs; \ + (cntx_p)->blkszs = _blkszs; \ } #define bli_cntx_set_bmults_buf( _bmults, cntx_p ) \ { \ - (cntx_p)->bmults = _bmults; \ + (cntx_p)->bmults = _bmults; \ } #define bli_cntx_set_l3_vir_ukrs_buf( _l3_vir_ukrs, cntx_p ) \ { \ - (cntx_p)->l3_vir_ukrs = _l3_vir_ukrs; \ + (cntx_p)->l3_vir_ukrs = _l3_vir_ukrs; \ } #define bli_cntx_set_l3_nat_ukrs_buf( _l3_nat_ukrs, cntx_p ) \ { \ - (cntx_p)->l3_nat_ukrs = _l3_nat_ukrs; \ + (cntx_p)->l3_nat_ukrs = _l3_nat_ukrs; \ } #define bli_cntx_set_l3_nat_ukrs_prefs_buf( _l3_nat_ukrs_prefs, cntx_p ) \ { \ - (cntx_p)->l3_nat_ukrs_prefs = 
_l3_nat_ukrs_prefs; \ + (cntx_p)->l3_nat_ukrs_prefs = _l3_nat_ukrs_prefs; \ } #define bli_cntx_set_l1f_kers_buf( _l1f_kers, cntx_p ) \ { \ - (cntx_p)->l1f_kers = _l1f_kers; \ + (cntx_p)->l1f_kers = _l1f_kers; \ } #define bli_cntx_set_l1v_kers_buf( _l1v_kers, cntx_p ) \ { \ - (cntx_p)->l1v_kers = _l1v_kers; \ + (cntx_p)->l1v_kers = _l1v_kers; \ } #define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \ { \ - (cntx_p)->packm_ukrs = _packm_ukrs; \ + (cntx_p)->packm_ukrs = _packm_ukrs; \ } #define bli_cntx_set_method( _method, cntx_p ) \ { \ - (cntx_p)->method = _method; \ + (cntx_p)->method = _method; \ } #define bli_cntx_set_schema_a( _schema_a, cntx_p ) \ { \ - (cntx_p)->schema_a = _schema_a; \ + (cntx_p)->schema_a = _schema_a; \ } #define bli_cntx_set_schema_b( _schema_b, cntx_p ) \ { \ - (cntx_p)->schema_b = _schema_b; \ + (cntx_p)->schema_b = _schema_b; \ } #define bli_cntx_set_schema_c( _schema_c, cntx_p ) \ { \ - (cntx_p)->schema_c = _schema_c; \ + (cntx_p)->schema_c = _schema_c; \ +} + +#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +{ \ + (cntx_p)->membrk = _membrk; \ } // cntx_t query (complex) @@ -264,6 +275,11 @@ typedef struct cntx_s \ bli_cntx_schema_b( cntx ) +#define bli_cntx_get_membrk( cntx ) \ +\ + bli_cntx_membrk( cntx ) + + // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c index 25530b1ed..83b936aae 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_mem.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,207 +39,15 @@ pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; #endif -// Declare one memory pool structure for each block size/shape we want to -// be able to allocate. 
-static pool_t pools[3]; - - +static membrk_t global_membrk; // ----------------------------------------------------------------------------- -void bli_mem_acquire_m( siz_t req_size, - packbuf_t buf_type, - mem_t* mem ) +membrk_t* bli_mem_global_membrk( void ) { - pool_t* pool; - pblk_t* pblk; - dim_t pi; - siz_t block_size; - - // Make sure the API is initialized. - bli_mem_init(); - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - // For general-use buffer requests, such as those used by level-2 - // operations, dynamically allocating memory is sufficient. - void* buf_sys = bli_malloc_pool( req_size ); - - // Initialize the mem_t object with: - // - the address of the memory block, - // - the buffer type (a packbuf_t value), and - // - the size of the requested region. - // NOTE: We do not initialize the pool field since this block did not - // come from a memory pool. - bli_mem_set_buffer( buf_sys, mem ); - bli_mem_set_buf_sys( buf_sys, mem ); - bli_mem_set_buf_type( buf_type, mem ); - bli_mem_set_size( req_size, mem ); - } - else - { - // This branch handles cases where the memory block needs to come - // from an internal memory pool, in which blocks are allocated once - // and then recycled. - - // Map the requested packed buffer type to a zero-based index, which - // we then use to select the corresponding memory pool. - pi = bli_packbuf_index( buf_type ); - pool = &pools[ pi ]; - - // Unconditionally perform error checking on the memory pool. - { - err_t e_val; - - // Make sure that the requested matrix size fits inside of a block - // of the corresponding pool. If it does not, the pool was somehow - // initialized improperly. - e_val = bli_check_requested_block_size_for_pool( req_size, pool ); - bli_check_error_code( e_val ); - } - - // Extract the address of the pblk_t struct within the mem_t. 
- pblk = bli_mem_pblk( mem ); - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - // BEGIN CRITICAL SECTION - { - - // Checkout a block from the pool. If the pool is exhausted, - // either because it is still empty or because all blocks have - // been checked out already, additional blocks will be allocated - // automatically, as-needed. Note that the addresses are stored - // directly into the mem_t struct since pblk is the address of - // the struct's pblk_t field. - bli_pool_checkout_block( pblk, pool ); - - // Query the size of the blocks in the pool so we can store it in - // the mem_t object. At this point, it is guaranteed to be at - // least as large as req_size. (NOTE: We must perform the query - // within the critical section to ensure that the pool hasn't - // changed, as unlikely as that would be.) - block_size = bli_pool_block_size( pool ); - - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - - // Initialize the mem_t object with: - // - the buffer type (a packbuf_t value), - // - the address of the memory pool to which it belongs, and - // - the size of the contiguous memory block (NOT the size of the - // requested region). - // The actual addresses (system and aligned) are already stored in - // the mem_t struct's pblk_t field - bli_mem_set_buf_type( buf_type, mem ); - bli_mem_set_pool( pool, mem ); - bli_mem_set_size( block_size, mem ); - } + return &global_membrk; } - -void bli_mem_release( mem_t* mem ) -{ - packbuf_t buf_type; - pool_t* pool; - pblk_t* pblk; - siz_t block_size_cur; - siz_t block_size_prev; - - // Make sure the API is initialized. - bli_mem_init(); - - // Extract the buffer type so we know what kind of memory was allocated. 
- buf_type = bli_mem_buf_type( mem ); - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - void* buf_sys = bli_mem_buf_sys( mem ); - - // For general-use buffers, we dynamically allocate memory, and so - // here we need to free. - bli_free_pool( buf_sys ); - } - else - { - // Extract the address of the pool from which the memory was - // allocated. - pool = bli_mem_pool( mem ); - - // Extract the address of the pblk_t struct within the mem_t struct. - pblk = bli_mem_pblk( mem ); - - // Query the size of the blocks that were in the pool at the time - // the pblk_t was checked out. (This is used below, in the critical - // section.) - block_size_prev = bli_mem_size( mem ); - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - - // Query the size of the blocks currently in the pool. - block_size_cur = bli_pool_block_size( pool ); - - // If the block size of the pool has changed since the pblk_t - // was checked out, then we need to free the pblk_t rather - // than check it back in. Why? Because the pool's block size - // has (most likely) increased to meet changing needs (example: - // larger cache blocksizes). Thus, the current pblk_t's smaller - // allocated size is of no use anymore. - if ( block_size_cur != block_size_prev ) - { - // Free the pblk_t using the appropriate function in the - // pool API. - bli_pool_free_block( pblk ); - } - else - { - // Check the block back into the pool. - bli_pool_checkin_block( pblk, pool ); - } - - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - } - - // Clear the mem_t object so that it appears unallocated. This clears: - // - the pblk_t struct's fields (ie: the buffer addresses) - // - the pool field - // - the size field - // NOTE: We do not clear the buf_type field since there is no - // "uninitialized" value for packbuf_t. 
- bli_mem_clear( mem ); -} - - -void bli_mem_acquire_v( siz_t req_size, - mem_t* mem ) -{ - bli_mem_acquire_m( req_size, - BLIS_BUFFER_FOR_GEN_USE, - mem ); -} - - siz_t bli_mem_pool_size( packbuf_t buf_type ) { siz_t r_val; @@ -251,15 +60,15 @@ siz_t bli_mem_pool_size( packbuf_t buf_type ) } else { - dim_t index; + dim_t pool_index; pool_t* pool; // Acquire the pointer to the pool corresponding to the buf_type // provided. - index = bli_packbuf_index( buf_type ); - pool = &(pools[index]); + pool_index = bli_packbuf_index( buf_type ); + pool = bli_membrk_pool( pool_index, &global_membrk ); - // Compute the pool "size" as the product of the block size + // Compute the pool "size" as the product of the block size // and the number of blocks in the pool. r_val = bli_pool_block_size( pool ) * bli_pool_num_blocks( pool ); @@ -300,8 +109,8 @@ void bli_mem_init( void ) // critical section. if ( bli_mem_is_init == FALSE ) { - // Initialize the memory pools. - bli_mem_init_pools( &cntx ); + // Initialize the global membrk_t object and its memory pools. + bli_membrk_init( &cntx, &global_membrk ); // After initialization, mark the API as initialized. bli_mem_is_init = TRUE; @@ -332,16 +141,16 @@ void bli_mem_reinit( cntx_t* cntx ) // initialized (unlikely), we emulate the body of bli_mem_init(). if ( bli_mem_is_init == FALSE ) { - // Initialize the memory pools. - bli_mem_init_pools( cntx ); + // Initialize the global membrk_t object and its memory pools. + bli_membrk_init( cntx, &global_membrk ); // After initialization, mark the API as initialized. bli_mem_is_init = TRUE; } else { - // Reinitialize the memory pools. - bli_mem_reinit_pools( cntx ); + // Reinitialize the global membrk_t object's memory pools. + bli_membrk_reinit_pools( cntx, &global_membrk ); } } // END CRITICAL SECTION @@ -373,8 +182,8 @@ void bli_mem_finalize( void ) // critical section. if ( bli_mem_is_init == TRUE ) { - // Finalize the memory pools. 
- bli_mem_finalize_pools(); + // Finalize the global membrk_t object and its memory pools. + bli_membrk_finalize( &global_membrk ); // After finalization, mark the API as uninitialized. bli_mem_is_init = FALSE; @@ -392,275 +201,3 @@ bool_t bli_mem_is_initialized( void ) return bli_mem_is_init; } -// ----------------------------------------------------------------------------- - -void bli_mem_init_pools( cntx_t* cntx ) -{ - // Map each of the packbuf_t values to an index starting at zero. - const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); - const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); - const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); - - const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; - - // Alias the pool addresses to convenient identifiers. - pool_t* pool_a = &pools[ index_a ]; - pool_t* pool_b = &pools[ index_b ]; - pool_t* pool_c = &pools[ index_c ]; - - // Start with empty pools. - const dim_t num_blocks_a = 0; - const dim_t num_blocks_b = 0; - const dim_t num_blocks_c = 0; - - siz_t block_size_a = 0; - siz_t block_size_b = 0; - siz_t block_size_c = 0; - - // Determine the block size for each memory pool. - bli_mem_compute_pool_block_sizes( &block_size_a, - &block_size_b, - &block_size_c, - cntx ); - - // Initialize the memory pools for A, B, and C. - bli_pool_init( num_blocks_a, block_size_a, align_size, pool_a ); - bli_pool_init( num_blocks_b, block_size_b, align_size, pool_b ); - bli_pool_init( num_blocks_c, block_size_c, align_size, pool_c ); -} - -void bli_mem_reinit_pools( cntx_t* cntx ) -{ - // Map each of the packbuf_t values to an index starting at zero. - const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); - const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); - const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); - - const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; - - // Alias the pool addresses to convenient identifiers. 
- pool_t* pool_a = &pools[ index_a ]; - pool_t* pool_b = &pools[ index_b ]; - pool_t* pool_c = &pools[ index_c ]; - - // Query the number of blocks currently allocated in each pool. - const dim_t num_blocks_a = bli_pool_num_blocks( pool_a ); - const dim_t num_blocks_b = bli_pool_num_blocks( pool_b ); - const dim_t num_blocks_c = bli_pool_num_blocks( pool_c ); - - siz_t block_size_a_new = 0; - siz_t block_size_b_new = 0; - siz_t block_size_c_new = 0; - - // Determine the context-implied block size needed for each pool. - bli_mem_compute_pool_block_sizes( &block_size_a_new, - &block_size_b_new, - &block_size_c_new, - cntx ); - - // Reinitialize the pool, but only if one of the parameters has - // changed in such a way that reinitialization would be required. - // In this case, the align_size is constant, as is num_blocks, so - // what this actually boils down to is that reinitialization of a - // pool occurs only if the block size for that pool has increased. - bli_pool_reinit_if( num_blocks_a, block_size_a_new, align_size, pool_a ); - bli_pool_reinit_if( num_blocks_b, block_size_b_new, align_size, pool_b ); - bli_pool_reinit_if( num_blocks_c, block_size_c_new, align_size, pool_c ); -} - -void bli_mem_finalize_pools( void ) -{ - // Map each of the packbuf_t values to an index starting at zero. - dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); - dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); - dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); - - // Alias the pool addresses to convenient identifiers. - pool_t* pool_a = &pools[ index_a ]; - pool_t* pool_b = &pools[ index_b ]; - pool_t* pool_c = &pools[ index_c ]; - - // Finalize the memory pools for A, B, and C. 
- bli_pool_finalize( pool_a ); - bli_pool_finalize( pool_b ); - bli_pool_finalize( pool_c ); -} - -// ----------------------------------------------------------------------------- - -void bli_mem_compute_pool_block_sizes( siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ) -{ - const ind_t im = bli_cntx_get_ind_method( cntx ); - - siz_t bs_cand_a = 0; - siz_t bs_cand_b = 0; - siz_t bs_cand_c = 0; - - num_t dt; - - // Compute pool block sizes for each datatype and find the maximum - // size for each pool. This is done so that new pools do not need - // to be allocated if the user switches datatypes. - for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) - { - siz_t bs_dt_a; - siz_t bs_dt_b; - siz_t bs_dt_c; - - // Avoid considering induced methods for real datatypes. - if ( bli_is_real( dt ) && im != BLIS_NAT ) continue; - - bli_mem_compute_pool_block_sizes_dt( dt, - &bs_dt_a, - &bs_dt_b, - &bs_dt_c, - cntx ); - - bs_cand_a = bli_max( bs_dt_a, bs_cand_a ); - bs_cand_b = bli_max( bs_dt_b, bs_cand_b ); - bs_cand_c = bli_max( bs_dt_c, bs_cand_c ); - } - - // Save the results. - *bs_a = bs_cand_a; - *bs_b = bs_cand_b; - *bs_c = bs_cand_c; -} - -// ----------------------------------------------------------------------------- - -void bli_mem_compute_pool_block_sizes_dt( num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ) -{ - siz_t size_dt = bli_datatype_size( dt ); - - blksz_t* mr; - blksz_t* nr; - - blksz_t* mc; - blksz_t* kc; - blksz_t* nc; - - dim_t mr_dt; - dim_t nr_dt; - dim_t max_mnr_dt; - - dim_t mc_max_dt; - dim_t kc_max_dt; - dim_t nc_max_dt; - - dim_t packmr_dt; - dim_t packnr_dt; - dim_t max_packmnr_dt; - - dim_t scale_num_dt; - dim_t scale_den_dt; - - dim_t pool_mc_dt, left_mc_dt; - dim_t pool_nc_dt, left_nc_dt; - dim_t pool_kc_dt; - - // - // Find the larger of the two register blocksizes. - // - - // Query the mr and nr blksz_t objects for the given method of - // execution. 
- mr = bli_cntx_get_blksz( BLIS_MR, cntx ); - nr = bli_cntx_get_blksz( BLIS_NR, cntx ); - - // Extract the mr and nr values specific to the current datatype. - mr_dt = bli_blksz_get_def( dt, mr ); - nr_dt = bli_blksz_get_def( dt, nr ); - - // Find the maximum of mr and nr. - max_mnr_dt = bli_max( mr_dt, nr_dt ); - - // - // Define local maximum cache blocksizes. - // - - // Query the mc, kc, and nc blksz_t objects for native execution. - mc = bli_cntx_get_blksz( BLIS_MC, cntx ); - kc = bli_cntx_get_blksz( BLIS_KC, cntx ); - nc = bli_cntx_get_blksz( BLIS_NC, cntx ); - - // Extract the maximum mc, kc, and nc values specific to the current - // datatype. - mc_max_dt = bli_blksz_get_max( dt, mc ); - kc_max_dt = bli_blksz_get_max( dt, kc ); - nc_max_dt = bli_blksz_get_max( dt, nc ); - - // Add max(mr,nr) to kc to make room for the nudging of kc at - // runtime to be a multiple of mr or nr for triangular operations - // trmm, trmm3, and trsm. - kc_max_dt += max_mnr_dt; - - // - // Compute scaling factors. - // - - // Compute integer scaling factors (numerator and denominator) used - // to account for situations when the packing register blocksizes are - // larger than the regular register blocksizes. - - // In order to compute the scaling factors, we first have to determine - // whether ( packmr / mr ) is greater than ( packnr / nr ). This is - // needed ONLY because the amount of space allocated for a block of A - // and a panel of B needs to be such that MR and NR can be swapped (ie: - // A is packed with NR and B is packed with MR). This transformation is - // needed for right-side trsm when inducing an algorithm that (a) has - // favorable access patterns for column-stored C and (b) allows the - // macro-kernel to reuse the existing left-side fused gemmtrsm micro- - // kernels. 
We avoid integer division by cross-multiplying: - // - // ( packmr / mr ) >= ( packnr / nr ) - // ( packmr / mr ) * nr >= packnr - // packmr * nr >= packnr * mr - // - // So, if packmr * nr >= packnr * mr, then we will use packmr and mr as - // our scaling factors. Otherwise, we'll use packnr and nr. - - packmr_dt = bli_blksz_get_max( dt, mr ); - packnr_dt = bli_blksz_get_max( dt, nr ); - - if ( packmr_dt * nr_dt >= - packnr_dt * mr_dt ) { scale_num_dt = packmr_dt; - scale_den_dt = mr_dt; } - else { scale_num_dt = packnr_dt; - scale_den_dt = nr_dt; } - - // - // Compute pool block dimensions. - // - - pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt; - left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt; - - pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt; - left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt; - - pool_kc_dt = ( kc_max_dt ); - - if ( left_mc_dt > 0 ) pool_mc_dt += 1; - if ( left_nc_dt > 0 ) pool_nc_dt += 1; - - // - // Compute pool block sizes - // - - // We add an extra micro-panel of space to the block sizes for A and B - // just to be sure any pre-loading performed by the micro-kernel does - // not cause a segmentation fault. - max_packmnr_dt = bli_max( packmr_dt, packnr_dt ); - - *bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; - *bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; - *bs_c = ( pool_mc_dt ) * pool_nc_dt * size_dt; -} diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index 8d6d71501..9ef741934 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,37 +33,21 @@ */ +#ifndef BLIS_MEM_H +#define BLIS_MEM_H + +// ----------------------------------------------------------------------------- + +membrk_t* bli_mem_global_membrk( void ); +siz_t bli_mem_pool_size( packbuf_t buf_type ); + +// ----------------------------------------------------------------------------- + void bli_mem_init( void ); void bli_mem_reinit( cntx_t* cntx ); void bli_mem_finalize( void ); bool_t bli_mem_is_initialized( void ); -// ----------------------------------------------------------------------------- -void bli_mem_acquire_m( siz_t req_size, - packbuf_t buf_type, - mem_t* mem ); - -void bli_mem_acquire_v( siz_t req_size, - mem_t* mem ); - -void bli_mem_release( mem_t* mem ); - -siz_t bli_mem_pool_size( packbuf_t buf_type ); - -// ----------------------------------------------------------------------------- - -void bli_mem_init_pools( cntx_t* cntx ); -void bli_mem_reinit_pools( cntx_t* cntx ); -void bli_mem_finalize_pools( void ); - -void bli_mem_compute_pool_block_sizes( siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ); -void bli_mem_compute_pool_block_sizes_dt( num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ); +#endif diff --git a/frame/base/bli_membrk.c b/frame/base/bli_membrk.c new file mode 100644 index 000000000..33a998de1 --- /dev/null +++ b/frame/base/bli_membrk.c @@ -0,0 +1,578 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_membrk_init + ( + cntx_t* cntx, + membrk_t* membrk + ) +{ + bli_mutex_init( bli_membrk_mutex( membrk ) ); + bli_membrk_init_pools( cntx, membrk ); + bli_membrk_set_malloc_fp( bli_malloc_pool, membrk ); +} + +void bli_membrk_finalize + ( + membrk_t* membrk + ) +{ + bli_membrk_set_malloc_fp( NULL, membrk ); + bli_membrk_finalize_pools( membrk ); + bli_mutex_finalize( bli_membrk_mutex( membrk ) ); +} + +void bli_membrk_acquire_m + ( + membrk_t* membrk, + siz_t req_size, + packbuf_t buf_type, + mem_t* mem + ) +{ + pool_t* pool; + pblk_t* pblk; + dim_t pi; + siz_t block_size; + + // Make sure the API is initialized. + //assert( membrk ); //?? + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // For general-use buffer requests, such as those used by level-2 + // operations, dynamically allocating memory is sufficient. + // Note that we use the malloc()-style memory allocation function + // that is stored in the membrk_t object. + void* buf_sys = bli_membrk_malloc( req_size, membrk ); + + // Initialize the mem_t object with: + // - the address of the memory block, + // - the buffer type (a packbuf_t value), + // - the size of the requested region, + // - the membrk_t from which the mem_t entry was acquired. + // NOTE: We do not initialize the pool field since this block did not + // come from a memory pool. + bli_mem_set_buffer( buf_sys, mem ); + bli_mem_set_buf_sys( buf_sys, mem ); + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_size( req_size, mem ); + bli_mem_set_membrk( membrk, mem ); + } + else + { + // This branch handles cases where the memory block needs to come + // from an internal memory pool, in which blocks are allocated once + // and then recycled. + + // Map the requested packed buffer type to a zero-based index, which + // we then use to select the corresponding memory pool. 
+ pi = bli_packbuf_index( buf_type ); + pool = bli_membrk_pool( pi, membrk ); + + // Unconditionally perform error checking on the memory pool. + { + err_t e_val; + + // Make sure that the requested matrix size fits inside of a block + // of the corresponding pool. If it does not, the pool was somehow + // initialized improperly. + e_val = bli_check_requested_block_size_for_pool( req_size, pool ); + bli_check_error_code( e_val ); + } + + // Extract the address of the pblk_t struct within the mem_t. + pblk = bli_mem_pblk( mem ); + + // BEGIN CRITICAL SECTION + bli_membrk_lock( membrk ); + { + + // Checkout a block from the pool. If the pool is exhausted, + // either because it is still empty or because all blocks have + // been checked out already, additional blocks will be allocated + // automatically, as-needed. Note that the addresses are stored + // directly into the mem_t struct since pblk is the address of + // the struct's pblk_t field. + bli_pool_checkout_block( pblk, pool ); + + // Query the size of the blocks in the pool so we can store it in + // the mem_t object. At this point, it is guaranteed to be at + // least as large as req_size. (NOTE: We must perform the query + // within the critical section to ensure that the pool hasn't + // changed, as unlikely as that would be.) + block_size = bli_pool_block_size( pool ); + + } + bli_membrk_unlock( membrk ); + // END CRITICAL SECTION + + // Initialize the mem_t object with: + // - the buffer type (a packbuf_t value), + // - the address of the memory pool to which it belongs, + // - the size of the contiguous memory block (NOT the size of the + // requested region), + // - the membrk_t from which the mem_t entry was acquired. 
+ // The actual addresses (system and aligned) are already stored in + // the mem_t struct's pblk_t field + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_pool( pool, mem ); + bli_mem_set_size( block_size, mem ); + bli_mem_set_membrk( membrk, mem ); + } +} + + +void bli_membrk_release + ( + mem_t* mem + ) +{ + packbuf_t buf_type; + pool_t* pool; + pblk_t* pblk; + siz_t block_size_cur; + siz_t block_size_prev; + membrk_t* membrk; + + // Extract the membrk_t address from the mem_t object. + membrk = bli_mem_membrk( mem ); + + // Extract the buffer type so we know what kind of memory was allocated. + buf_type = bli_mem_buf_type( mem ); + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + void* buf_sys = bli_mem_buf_sys( mem ); + + // For general-use buffers, we dynamically allocate memory, and so + // here we need to free. + // Note that we use the free()-style memory release function that + // is stored in the membrk_t object. + bli_membrk_free( buf_sys, membrk ); + } + else + { + // Extract the address of the pool from which the memory was + // allocated. + pool = bli_mem_pool( mem ); + + // Extract the address of the pblk_t struct within the mem_t struct. + pblk = bli_mem_pblk( mem ); + + // Query the size of the blocks that were in the pool at the time + // the pblk_t was checked out. (This is used below, in the critical + // section.) + block_size_prev = bli_mem_size( mem ); + + // BEGIN CRITICAL SECTION + bli_membrk_lock( membrk ); + { + + // Query the size of the blocks currently in the pool. + block_size_cur = bli_pool_block_size( pool ); + + // If the block size of the pool has changed since the pblk_t + // was checked out, then we need to free the pblk_t rather + // than check it back in. Why? Because the pool's block size + // has (most likely) increased to meet changing needs (example: + // larger cache blocksizes). Thus, the current pblk_t's smaller + // allocated size is of no use anymore. 
+ if ( block_size_cur != block_size_prev ) + { + // Free the pblk_t using the appropriate function in the + // pool API. + bli_pool_free_block( pblk ); + } + else + { + // Check the block back into the pool. + bli_pool_checkin_block( pblk, pool ); + } + + } + bli_membrk_unlock( membrk ); + // END CRITICAL SECTION + } + + // Clear the mem_t object so that it appears unallocated. This clears: + // - the pblk_t struct's fields (ie: the buffer addresses) + // - the pool field + // - the size field + // - the membrk field + // NOTE: We do not clear the buf_type field since there is no + // "uninitialized" value for packbuf_t. + bli_mem_clear( mem ); +} + + +void bli_membrk_acquire_v + ( + membrk_t* membrk, + siz_t req_size, + mem_t* mem + ) +{ + bli_membrk_acquire_m( membrk, + req_size, + BLIS_BUFFER_FOR_GEN_USE, + mem ); +} + + +siz_t bli_membrk_pool_size + ( + membrk_t* membrk, + packbuf_t buf_type + ) +{ + siz_t r_val; + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // We don't (yet) track the amount of general-purpose + // memory that is currently allocated. + r_val = 0; + } + else + { + dim_t pool_index; + pool_t* pool; + + // Acquire the pointer to the pool corresponding to the buf_type + // provided. + pool_index = bli_packbuf_index( buf_type ); + pool = bli_membrk_pool( pool_index, membrk ); + + // Compute the pool "size" as the product of the block size + // and the number of blocks in the pool. + r_val = bli_pool_block_size( pool ) * + bli_pool_num_blocks( pool ); + } + + return r_val; +} + +// ----------------------------------------------------------------------------- + +void bli_membrk_init_pools + ( + cntx_t* cntx, + membrk_t* membrk + ) +{ + // Map each of the packbuf_t values to an index starting at zero. 
+ const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); + const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); + const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); + + const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; + + // Alias the pool addresses to convenient identifiers. + pool_t* pool_a = bli_membrk_pool( index_a, membrk ); + pool_t* pool_b = bli_membrk_pool( index_b, membrk ); + pool_t* pool_c = bli_membrk_pool( index_c, membrk ); + + // Start with empty pools. + const dim_t num_blocks_a = 0; + const dim_t num_blocks_b = 0; + const dim_t num_blocks_c = 0; + + siz_t block_size_a = 0; + siz_t block_size_b = 0; + siz_t block_size_c = 0; + + // Determine the block size for each memory pool. + bli_membrk_compute_pool_block_sizes( &block_size_a, + &block_size_b, + &block_size_c, + cntx ); + + // Initialize the memory pools for A, B, and C. + bli_pool_init( num_blocks_a, block_size_a, align_size, pool_a ); + bli_pool_init( num_blocks_b, block_size_b, align_size, pool_b ); + bli_pool_init( num_blocks_c, block_size_c, align_size, pool_c ); +} + +void bli_membrk_reinit_pools + ( + cntx_t* cntx, + membrk_t* membrk + ) +{ + // Map each of the packbuf_t values to an index starting at zero. + const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); + const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); + const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); + + const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; + + // Alias the pool addresses to convenient identifiers. + pool_t* pool_a = bli_membrk_pool( index_a, membrk ); + pool_t* pool_b = bli_membrk_pool( index_b, membrk ); + pool_t* pool_c = bli_membrk_pool( index_c, membrk ); + + // Query the number of blocks currently allocated in each pool. 
+ const dim_t num_blocks_a = bli_pool_num_blocks( pool_a ); + const dim_t num_blocks_b = bli_pool_num_blocks( pool_b ); + const dim_t num_blocks_c = bli_pool_num_blocks( pool_c ); + + siz_t block_size_a_new = 0; + siz_t block_size_b_new = 0; + siz_t block_size_c_new = 0; + + // Determine the context-implied block size needed for each pool. + bli_membrk_compute_pool_block_sizes( &block_size_a_new, + &block_size_b_new, + &block_size_c_new, + cntx ); + + // Reinitialize the pool, but only if one of the parameters has + // changed in such a way that reinitialization would be required. + // In this case, the align_size is constant, as is num_blocks, so + // what this actually boils down to is that reinitialization of a + // pool occurs only if the block size for that pool has increased. + bli_pool_reinit_if( num_blocks_a, block_size_a_new, align_size, pool_a ); + bli_pool_reinit_if( num_blocks_b, block_size_b_new, align_size, pool_b ); + bli_pool_reinit_if( num_blocks_c, block_size_c_new, align_size, pool_c ); +} + +void bli_membrk_finalize_pools + ( + membrk_t* membrk + ) +{ + // Map each of the packbuf_t values to an index starting at zero. + dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); + dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); + dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); + + // Alias the pool addresses to convenient identifiers. + pool_t* pool_a = bli_membrk_pool( index_a, membrk ); + pool_t* pool_b = bli_membrk_pool( index_b, membrk ); + pool_t* pool_c = bli_membrk_pool( index_c, membrk ); + + // Finalize the memory pools for A, B, and C. 
+ bli_pool_finalize( pool_a ); + bli_pool_finalize( pool_b ); + bli_pool_finalize( pool_c ); +} + +// ----------------------------------------------------------------------------- + +void bli_membrk_compute_pool_block_sizes + ( + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + cntx_t* cntx + ) +{ + const ind_t im = bli_cntx_get_ind_method( cntx ); + + siz_t bs_cand_a = 0; + siz_t bs_cand_b = 0; + siz_t bs_cand_c = 0; + + num_t dt; + + // Compute pool block sizes for each datatype and find the maximum + // size for each pool. This is done so that new pools do not need + // to be allocated if the user switches datatypes. + for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) + { + siz_t bs_dt_a; + siz_t bs_dt_b; + siz_t bs_dt_c; + + // Avoid considering induced methods for real datatypes. + if ( bli_is_real( dt ) && im != BLIS_NAT ) continue; + + bli_membrk_compute_pool_block_sizes_dt( dt, + &bs_dt_a, + &bs_dt_b, + &bs_dt_c, + cntx ); + + bs_cand_a = bli_max( bs_dt_a, bs_cand_a ); + bs_cand_b = bli_max( bs_dt_b, bs_cand_b ); + bs_cand_c = bli_max( bs_dt_c, bs_cand_c ); + } + + // Save the results. + *bs_a = bs_cand_a; + *bs_b = bs_cand_b; + *bs_c = bs_cand_c; +} + +// ----------------------------------------------------------------------------- + +void bli_membrk_compute_pool_block_sizes_dt + ( + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + cntx_t* cntx + ) +{ + siz_t size_dt = bli_datatype_size( dt ); + + blksz_t* mr; + blksz_t* nr; + + blksz_t* mc; + blksz_t* kc; + blksz_t* nc; + + dim_t mr_dt; + dim_t nr_dt; + dim_t max_mnr_dt; + + dim_t mc_max_dt; + dim_t kc_max_dt; + dim_t nc_max_dt; + + dim_t packmr_dt; + dim_t packnr_dt; + dim_t max_packmnr_dt; + + dim_t scale_num_dt; + dim_t scale_den_dt; + + dim_t pool_mc_dt, left_mc_dt; + dim_t pool_nc_dt, left_nc_dt; + dim_t pool_kc_dt; + + // + // Find the larger of the two register blocksizes. + // + + // Query the mr and nr blksz_t objects for the given method of + // execution. 
+ mr = bli_cntx_get_blksz( BLIS_MR, cntx ); + nr = bli_cntx_get_blksz( BLIS_NR, cntx ); + + // Extract the mr and nr values specific to the current datatype. + mr_dt = bli_blksz_get_def( dt, mr ); + nr_dt = bli_blksz_get_def( dt, nr ); + + // Find the maximum of mr and nr. + max_mnr_dt = bli_max( mr_dt, nr_dt ); + + // + // Define local maximum cache blocksizes. + // + + // Query the mc, kc, and nc blksz_t objects for native execution. + mc = bli_cntx_get_blksz( BLIS_MC, cntx ); + kc = bli_cntx_get_blksz( BLIS_KC, cntx ); + nc = bli_cntx_get_blksz( BLIS_NC, cntx ); + + // Extract the maximum mc, kc, and nc values specific to the current + // datatype. + mc_max_dt = bli_blksz_get_max( dt, mc ); + kc_max_dt = bli_blksz_get_max( dt, kc ); + nc_max_dt = bli_blksz_get_max( dt, nc ); + + // Add max(mr,nr) to kc to make room for the nudging of kc at + // runtime to be a multiple of mr or nr for triangular operations + // trmm, trmm3, and trsm. + kc_max_dt += max_mnr_dt; + + // + // Compute scaling factors. + // + + // Compute integer scaling factors (numerator and denominator) used + // to account for situations when the packing register blocksizes are + // larger than the regular register blocksizes. + + // In order to compute the scaling factors, we first have to determine + // whether ( packmr / mr ) is greater than ( packnr / nr ). This is + // needed ONLY because the amount of space allocated for a block of A + // and a panel of B needs to be such that MR and NR can be swapped (ie: + // A is packed with NR and B is packed with MR). This transformation is + // needed for right-side trsm when inducing an algorithm that (a) has + // favorable access patterns for column-stored C and (b) allows the + // macro-kernel to reuse the existing left-side fused gemmtrsm micro- + // kernels. 
We avoid integer division by cross-multiplying: + // + // ( packmr / mr ) >= ( packnr / nr ) + // ( packmr / mr ) * nr >= packnr + // packmr * nr >= packnr * mr + // + // So, if packmr * nr >= packnr * mr, then we will use packmr and mr as + // our scaling factors. Otherwise, we'll use packnr and nr. + + packmr_dt = bli_blksz_get_max( dt, mr ); + packnr_dt = bli_blksz_get_max( dt, nr ); + + if ( packmr_dt * nr_dt >= + packnr_dt * mr_dt ) { scale_num_dt = packmr_dt; + scale_den_dt = mr_dt; } + else { scale_num_dt = packnr_dt; + scale_den_dt = nr_dt; } + + // + // Compute pool block dimensions. + // + + pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt; + left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt; + + pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt; + left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt; + + pool_kc_dt = ( kc_max_dt ); + + if ( left_mc_dt > 0 ) pool_mc_dt += 1; + if ( left_nc_dt > 0 ) pool_nc_dt += 1; + + // + // Compute pool block sizes + // + + // We add an extra micro-panel of space to the block sizes for A and B + // just to be sure any pre-loading performed by the micro-kernel does + // not cause a segmentation fault. + max_packmnr_dt = bli_max( packmr_dt, packnr_dt ); + + *bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; + *bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; + *bs_c = ( pool_mc_dt ) * pool_nc_dt * size_dt; +} diff --git a/frame/base/bli_membrk.h b/frame/base/bli_membrk.h new file mode 100644 index 000000000..5db956344 --- /dev/null +++ b/frame/base/bli_membrk.h @@ -0,0 +1,169 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_MEMBRK_H +#define BLIS_MEMBRK_H + +// -- Memory broker object type -- + +typedef struct membrk_s +{ + pool_t pools[3]; + mtx_t mutex; + + malloc_ft malloc_fp; + free_ft free_fp; +} membrk_t; + +#define bli_membrk_pool( pool_index, membrk_p ) \ +\ + ( (membrk_p)->pools + (pool_index) ) + +#define bli_membrk_mutex( membrk_p ) \ +\ + ( &( (membrk_p)->mutex ) ) + +#define bli_membrk_malloc_fp( membrk_p ) \ +\ + ( (membrk_p)->malloc_fp ) + +#define bli_membrk_free_fp( membrk_p ) \ +\ + ( (membrk_p)->free_fp ) + +#define bli_membrk_set_malloc_fp( _malloc_fp, membrk_p ) \ +{\ + (membrk_p)->malloc_fp = _malloc_fp; \ +} + +#define bli_membrk_set_free_fp( _free_fp, membrk_p ) \ +{\ + (membrk_p)->free_fp = _free_fp; \ +} + +#define bli_membrk_lock( membrk_p ) \ +{\ + bli_mutex_lock( &((membrk_p)->mutex) ); \ +} + +#define bli_membrk_unlock( membrk_p ) \ +{\ + bli_mutex_unlock( &((membrk_p)->mutex) ); \ +} + +#define bli_membrk_malloc( size, membrk ) \ +\ + /* Call the malloc()-style function in membrk. */ \ + ((membrk)->malloc_fp)( size ) + +#define bli_membrk_free( buf_p, membrk ) \ +\ + /* Call the free()-style function in membrk. 
*/ \ + ((membrk)->free_fp)( buf_p ) + + +// ----------------------------------------------------------------------------- + +void bli_membrk_init + ( + cntx_t* cntx, + membrk_t* membrk + ); +void bli_membrk_finalize + ( + membrk_t* membrk + ); + +void bli_membrk_acquire_m + ( + membrk_t* membrk, + siz_t req_size, + packbuf_t buf_type, + mem_t* mem + ); + +void bli_membrk_acquire_v + ( + membrk_t* membrk, + siz_t req_size, + mem_t* mem + ); + +void bli_membrk_release + ( + mem_t* mem + ); + +siz_t bli_membrk_pool_size + ( + membrk_t* membrk, + packbuf_t buf_type + ); + +// ---------------------------------------------------------------------------- + +void bli_membrk_init_pools + ( + cntx_t* cntx, + membrk_t* membrk + ); +void bli_membrk_reinit_pools + ( + cntx_t* cntx, + membrk_t* membrk + ); +void bli_membrk_finalize_pools + ( + membrk_t* membrk + ); + +void bli_membrk_compute_pool_block_sizes + ( + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + cntx_t* cntx + ); +void bli_membrk_compute_pool_block_sizes_dt + ( + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + cntx_t* cntx + ); + +#endif + diff --git a/frame/include/bli_mem_macro_defs.h b/frame/include/bli_mem_macro_defs.h index 51840b712..d0fe850cd 100644 --- a/frame/include/bli_mem_macro_defs.h +++ b/frame/include/bli_mem_macro_defs.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -58,6 +59,10 @@ \ ( (mem_p)->pool ) +#define bli_mem_membrk( mem_p ) \ +\ + ( (mem_p)->membrk ) + #define bli_mem_size( mem_p ) \ \ ( (mem_p)->size ) @@ -90,12 +95,17 @@ #define bli_mem_set_buf_type( buf_type0, mem_p ) \ { \ - mem_p->buf_type = buf_type0; \ + (mem_p)->buf_type = buf_type0; \ } #define bli_mem_set_pool( pool0, mem_p ) \ { \ - mem_p->pool = pool0; \ + (mem_p)->pool = pool0; \ +} + +#define bli_mem_set_membrk( membrk0, mem_p ) \ +{ \ + (mem_p)->membrk = membrk0; \ } #define bli_mem_set_size( size0, mem_p ) \ @@ -109,6 +119,7 @@ bli_mem_set_buf_sys( NULL, mem_p ); \ bli_mem_set_pool( NULL, mem_p ); \ bli_mem_set_size( 0, mem_p ); \ + bli_mem_set_membrk( NULL, mem_p ); \ } diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 30c72e735..306c09544 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -957,14 +958,14 @@ bli_obj_width_stored( obj ) } -// Release object's pack (and cast) memory entries back to memory manager +// Release object's pack mem_t entries back to memory manager #define bli_obj_release_pack( obj_p ) \ { \ mem_t* pack_mem_ = bli_obj_pack_mem( *(obj_p) ); \ \ if ( bli_mem_is_alloc( pack_mem_ ) ) \ - bli_mem_release( pack_mem_ ); \ + bli_membrk_release( pack_mem_ ); \ } diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 7274ce5a6..5f52c89b7 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -173,7 +174,6 @@ typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; - // // -- BLIS info bit field offsets ---------------------------------------------- // @@ -505,6 +505,10 @@ typedef enum // -- BLIS misc. 
structure types ----------------------------------------------- // +// -- Mutex type -- + +typedef struct mtx_s mtx_t; + // -- Pool block type -- typedef struct @@ -527,6 +531,19 @@ typedef struct siz_t align_size; } pool_t; +// -- Memory broker object type -- + +typedef struct membrk_s membrk_t; +/* +{ + pool_t pools[3]; + mtx_t mutex; + + malloc_ft malloc_fp; + free_ft free_fp; +} membrk_t; +*/ + // -- Memory object type -- typedef struct mem_s @@ -534,6 +551,7 @@ typedef struct mem_s pblk_t pblk; packbuf_t buf_type; pool_t* pool; + membrk_t* membrk; siz_t size; } mem_t; @@ -910,6 +928,7 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + membrk_t* membrk; } cntx_t; diff --git a/frame/include/blis.h b/frame/include/blis.h index 85d7a176d..32fca0c71 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -103,6 +104,7 @@ extern "C" { #include "bli_cntx.h" #include "bli_gks.h" #include "bli_ind.h" +#include "bli_membrk.h" #include "bli_pool.h" #include "bli_mem.h" #include "bli_part.h" diff --git a/frame/thread/bli_mutex.h b/frame/thread/bli_mutex.h new file mode 100644 index 000000000..5ccfebe63 --- /dev/null +++ b/frame/thread/bli_mutex.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_MUTEX_H +#define BLIS_MUTEX_H + +// Include definitions (mostly mtx_t) specific to the method of +// multithreading. +#include "bli_mutex_single.h" +#include "bli_mutex_openmp.h" +#include "bli_mutex_pthreads.h" + +// Thread mutex prototypes. + + +#endif + diff --git a/frame/thread/bli_mutex_openmp.h b/frame/thread/bli_mutex_openmp.h new file mode 100644 index 000000000..4aa82f8ae --- /dev/null +++ b/frame/thread/bli_mutex_openmp.h @@ -0,0 +1,72 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_MUTEX_OPENMP_H
+#define BLIS_MUTEX_OPENMP_H
+
+// Define mtx_t for situations when OpenMP multithreading is enabled.
+#ifdef BLIS_ENABLE_OPENMP
+
+#include <omp.h>
+
+// Define mtx_t as a thin wrapper around an omp_lock_t.
+typedef struct mtx_s
+{
+	omp_lock_t mutex;
+} mtx_t;
+
+// Define macros that take a mtx_t* and operate on the omp_lock_t it wraps.
+#define bli_mutex_init( mtx_p ) \
+{ \
+	omp_init_lock( &((mtx_p)->mutex) ); \
+}
+#define bli_mutex_finalize( mtx_p ) \
+{ \
+	omp_destroy_lock( &((mtx_p)->mutex) ); \
+}
+
+#define bli_mutex_lock( mtx_p ) \
+{ \
+	omp_set_lock( &((mtx_p)->mutex) ); \
+}
+#define bli_mutex_unlock( mtx_p ) \
+{ \
+	omp_unset_lock( &((mtx_p)->mutex) ); \
+}
+
+#endif
+
+#endif
+
diff --git a/frame/thread/bli_mutex_pthreads.h b/frame/thread/bli_mutex_pthreads.h
new file mode 100644
index 000000000..0ab1876b3
--- /dev/null
+++ b/frame/thread/bli_mutex_pthreads.h
@@ -0,0 +1,72 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_MUTEX_PTHREADS_H
+#define BLIS_MUTEX_PTHREADS_H
+
+// Define mtx_t for situations when POSIX multithreading is enabled.
+#ifdef BLIS_ENABLE_PTHREADS
+
+#include <pthread.h>
+
+// Define mtx_t as a thin wrapper around a pthread_mutex_t.
+typedef struct mtx_s
+{
+	pthread_mutex_t mutex;
+} mtx_t;
+
+// Define macros that take a mtx_t* and operate on the pthread_mutex_t it wraps (init uses default attributes).
+#define bli_mutex_init( mtx_p ) \
+{ \
+	pthread_mutex_init( &((mtx_p)->mutex), NULL ); \
+}
+#define bli_mutex_finalize( mtx_p ) \
+{ \
+	pthread_mutex_destroy( &((mtx_p)->mutex) ); \
+}
+
+#define bli_mutex_lock( mtx_p ) \
+{ \
+	pthread_mutex_lock( &((mtx_p)->mutex) ); \
+}
+#define bli_mutex_unlock( mtx_p ) \
+{ \
+	pthread_mutex_unlock( &((mtx_p)->mutex) ); \
+}
+
+#endif
+
+#endif
+
diff --git a/frame/thread/bli_mutex_single.h b/frame/thread/bli_mutex_single.h
new file mode 100644
index 000000000..26aefcc21
--- /dev/null
+++ b/frame/thread/bli_mutex_single.h
@@ -0,0 +1,65 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_MUTEX_SINGLE_H +#define BLIS_MUTEX_SINGLE_H + +// Define mtx_t for situations when multithreading is disabled. +#ifndef BLIS_ENABLE_MULTITHREADING + +// Define mtx_t with no members. NOTE(review): ISO C requires at least one member; an empty struct relies on a compiler extension. +typedef struct mtx_s +{ +} mtx_t; + +// Define no-op macros to operate on the single-threaded mtx_t; each expands to an empty block. +#define bli_mutex_init( mtx_p ) \ +{ \ +} +#define bli_mutex_finalize( mtx_p ) \ +{ \ +} + +#define bli_mutex_lock( mtx_p ) \ +{ \ +} +#define bli_mutex_unlock( mtx_p ) \ +{ \ +} + +#endif + +#endif + diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 6ef2ebb1a..2498baf8c 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -51,6 +52,9 @@ #define BLIS_ENABLE_MULTITHREADING #endif +// Include thread mutex (mtx_t) object definitions and prototypes. +#include "bli_mutex.h" + // Include thread communicator (thrcomm_t) object definitions and prototypes. #include "bli_thrcomm.h" From c31b1e7b9d659b96433a87e5aecb90e457a104cc Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 27 Jul 2016 15:58:07 -0500 Subject: [PATCH 03/27] Relax alignment restrictions for sandybridge ukrs. Details: - Relaxed the base pointer and leading dimension alignment restrictions in the sandybridge gemm microkernels, allowing the use of vmovups/vmovupd instead of vmovaps/vmovapd. These changes mimic those made to the haswell microkernels in e0d2fa0 and ee2c139. - Updated testsuite modules as well as standalone test drivers in 'test' directory to use DBL_MAX as the initial time candidate. Thanks to Devin Matthews for suggesting this change. - Inserted #include "float.h" into bli_system.h (to gain access to DBL_MAX). - Minor update (vis-a-vis contexts) to driver code in test/3m4m. 
--- frame/include/bli_system.h | 1 + kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 12 +- .../x86_64/sandybridge/3/bli_gemm_asm_d8x4.c | 333 +++++++----------- test/3m4m/test_gemm.c | 11 +- test/test_gemm.c | 2 +- test/test_gemv.c | 2 +- test/test_ger.c | 2 +- test/test_hemm.c | 2 +- test/test_hemv.c | 2 +- test/test_her.c | 2 +- test/test_her2.c | 2 +- test/test_her2k.c | 2 +- test/test_herk.c | 2 +- test/test_trmm.c | 2 +- test/test_trmv.c | 2 +- test/test_trsm.c | 2 +- test/test_trsv.c | 2 +- testsuite/src/test_addm.c | 2 +- testsuite/src/test_addv.c | 2 +- testsuite/src/test_axpbyv.c | 2 +- testsuite/src/test_axpy2v.c | 2 +- testsuite/src/test_axpyf.c | 2 +- testsuite/src/test_axpym.c | 2 +- testsuite/src/test_axpyv.c | 2 +- testsuite/src/test_copym.c | 2 +- testsuite/src/test_copyv.c | 2 +- testsuite/src/test_dotaxpyv.c | 2 +- testsuite/src/test_dotv.c | 2 +- testsuite/src/test_dotxaxpyf.c | 2 +- testsuite/src/test_dotxf.c | 2 +- testsuite/src/test_dotxv.c | 2 +- testsuite/src/test_gemm.c | 2 +- testsuite/src/test_gemm_ukr.c | 2 +- testsuite/src/test_gemmtrsm_ukr.c | 2 +- testsuite/src/test_gemv.c | 2 +- testsuite/src/test_ger.c | 2 +- testsuite/src/test_hemm.c | 2 +- testsuite/src/test_hemv.c | 2 +- testsuite/src/test_her.c | 2 +- testsuite/src/test_her2.c | 2 +- testsuite/src/test_her2k.c | 2 +- testsuite/src/test_herk.c | 2 +- testsuite/src/test_normfm.c | 2 +- testsuite/src/test_normfv.c | 2 +- testsuite/src/test_randm.c | 2 +- testsuite/src/test_randv.c | 2 +- testsuite/src/test_scal2m.c | 2 +- testsuite/src/test_scal2v.c | 2 +- testsuite/src/test_scalm.c | 2 +- testsuite/src/test_scalv.c | 2 +- testsuite/src/test_setm.c | 2 +- testsuite/src/test_setv.c | 2 +- testsuite/src/test_subm.c | 2 +- testsuite/src/test_subv.c | 2 +- testsuite/src/test_symm.c | 2 +- testsuite/src/test_symv.c | 2 +- testsuite/src/test_syr.c | 2 +- testsuite/src/test_syr2.c | 2 +- testsuite/src/test_syr2k.c | 2 +- testsuite/src/test_syrk.c | 2 +- testsuite/src/test_trmm.c | 2 +- 
testsuite/src/test_trmm3.c | 2 +- testsuite/src/test_trmv.c | 2 +- testsuite/src/test_trsm.c | 2 +- testsuite/src/test_trsm_ukr.c | 2 +- testsuite/src/test_trsv.c | 2 +- testsuite/src/test_xpbyv.c | 2 +- 67 files changed, 200 insertions(+), 283 deletions(-) diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 57fe810fc..05139136b 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -40,6 +40,7 @@ #include #include #include +#include // Determine if we are on a 64-bit or 32-bit architecture #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bc06c819b..cb6097fe2 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -329,7 +329,7 @@ void bli_sgemm_asm_6x16 "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. + "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. "jz .SROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" @@ -474,8 +474,8 @@ void bli_sgemm_asm_6x16 " \n\t" " \n\t" ".SBETAZERO: \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. + " \n\t" + "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. "jz .SROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" @@ -910,7 +910,7 @@ void bli_dgemm_asm_6x8 "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. "jz .DROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" @@ -1053,8 +1053,8 @@ void bli_dgemm_asm_6x8 " \n\t" " \n\t" ".DBETAZERO: \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
"jz .DROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" diff --git a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c index 0b017fbcd..f8db398ca 100644 --- a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c +++ b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c @@ -414,23 +414,6 @@ void bli_sgemm_asm_8x8 "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 4*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (4*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -438,10 +421,8 @@ void bli_sgemm_asm_8x8 "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SCOLSTORED \n\t" // jump to column storage case + "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. + "jz .SCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -740,52 +721,52 @@ void bli_sgemm_asm_8x8 ".SCOLSTORED: \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70, + "vmovups (%%rcx), %%ymm0 \n\t" // load c00:c70, "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. 
+ "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c01:c71, + "vmovups (%%rcx), %%ymm1 \n\t" // load c01:c71, "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, "vaddps %%ymm14, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72, + "vmovups (%%rcx), %%ymm0 \n\t" // load c02:c72, "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c03:c73, + "vmovups (%%rcx), %%ymm1 \n\t" // load c03:c73, "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, "vaddps %%ymm12, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c04:c74, + "vmovups (%%rcx), %%ymm0 \n\t" // load c04:c74, "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c05:c75, + "vmovups (%%rcx), %%ymm1 \n\t" // load c05:c75, "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, "vaddps %%ymm10, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. 
+ "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c06:c76, + "vmovups (%%rcx), %%ymm0 \n\t" // load c06:c76, "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c07:c77, + "vmovups (%%rcx), %%ymm1 \n\t" // load c07:c77, "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, "vaddps %%ymm8, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. @@ -794,17 +775,16 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" ".SBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. 
+ "jz .SCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".SGENSTORBZ: \n\t" " \n\t" " \n\t" // update c00:c70 - "vmovapd %%ymm15, %%ymm0 \n\t" + "vmovups %%ymm15, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -826,7 +806,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c01:c71 - "vmovapd %%ymm14, %%ymm0 \n\t" + "vmovups %%ymm14, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -848,7 +828,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c02:c72 - "vmovapd %%ymm13, %%ymm0 \n\t" + "vmovups %%ymm13, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -870,7 +850,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c03:c73 - "vmovapd %%ymm12, %%ymm0 \n\t" + "vmovups %%ymm12, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -892,7 +872,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c04:c74 - "vmovapd %%ymm11, %%ymm0 \n\t" + "vmovups %%ymm11, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -914,7 +894,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c05:c75 - "vmovapd %%ymm10, %%ymm0 \n\t" + "vmovups %%ymm10, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -936,7 +916,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c06:c76 - "vmovapd %%ymm9, %%ymm0 \n\t" + "vmovups %%ymm9, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -958,7 +938,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c07:c77 - "vmovapd %%ymm8, %%ymm0 \n\t" + "vmovups %%ymm8, 
%%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -983,28 +963,28 @@ void bli_sgemm_asm_8x8 ".SCOLSTORBZ: \n\t" " \n\t" " \n\t" - "vmovaps %%ymm15, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm15, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm14, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm14, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm13, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm13, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm12, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm12, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm11, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm11, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm10, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm10, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm9, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm9, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm8, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm8, (%%rcx) \n\t" // and store back to memory. " \n\t" " \n\t" " \n\t" @@ -1378,23 +1358,6 @@ void bli_dgemm_asm_8x4 "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 
1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -1402,10 +1365,8 @@ void bli_dgemm_asm_8x4 "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DCOLSTORED \n\t" // jump to column storage case + "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. + "jz .DCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -1540,53 +1501,53 @@ void bli_dgemm_asm_8x4 ".DCOLSTORED: \n\t" " \n\t" // update c00:c33 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c00:c30, + "vmovupd (%%rcx), %%ymm0 \n\t" // load c00:c30, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c01:c31, + "vmovupd (%%rcx), %%ymm0 \n\t" // load c01:c31, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. 
"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c02:c32, + "vmovupd (%%rcx), %%ymm0 \n\t" // load c02:c32, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c03:c33, + "vmovupd (%%rcx), %%ymm0 \n\t" // load c03:c33, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. " \n\t" " \n\t" // update c40:c73 " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c40:c70, + "vmovupd (%%rdx), %%ymm0 \n\t" // load c40:c70, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c41:c71, + "vmovupd (%%rdx), %%ymm0 \n\t" // load c41:c71, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c42:c72, + "vmovupd (%%rdx), %%ymm0 \n\t" // load c42:c72, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. 
"addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c43:c73, + "vmovupd (%%rdx), %%ymm0 \n\t" // load c43:c73, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. @@ -1595,10 +1556,9 @@ void bli_dgemm_asm_8x4 " \n\t" " \n\t" ".DBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. + "jz .DCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -1669,29 +1629,29 @@ void bli_dgemm_asm_8x4 ".DCOLSTORBZ: \n\t" " \n\t" // update c00:c33 " \n\t" - "vmovapd %%ymm9, (%%rcx) \n\t" // store c00:c30 + "vmovupd %%ymm9, (%%rcx) \n\t" // store c00:c30 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm11, (%%rcx) \n\t" // store c01:c31 + "vmovupd %%ymm11, (%%rcx) \n\t" // store c01:c31 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm13, (%%rcx) \n\t" // store c02:c32 + "vmovupd %%ymm13, (%%rcx) \n\t" // store c02:c32 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm15, (%%rcx) \n\t" // store c03:c33 + "vmovupd %%ymm15, (%%rcx) \n\t" // store c03:c33 " \n\t" " \n\t" // update c40:c73 " \n\t" - "vmovapd %%ymm8, (%%rdx) \n\t" // store c40:c70 + "vmovupd %%ymm8, (%%rdx) \n\t" // store c40:c70 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm10, (%%rdx) \n\t" // store c41:c71 + "vmovupd %%ymm10, (%%rdx) \n\t" // store c41:c71 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm12, (%%rdx) \n\t" // store c42:c72 + "vmovupd %%ymm12, (%%rdx) \n\t" // store c42:c72 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" 
- "vmovapd %%ymm14, (%%rdx) \n\t" // store c43:c73 + "vmovupd %%ymm14, (%%rdx) \n\t" // store c43:c73 " \n\t" " \n\t" " \n\t" @@ -2260,23 +2220,6 @@ void bli_cgemm_asm_8x4 "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -2288,10 +2231,8 @@ void bli_cgemm_asm_8x4 "jne .CBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .CCOLSTORED \n\t" // jump to column storage case + "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. 
+ "jz .CCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -2459,90 +2400,90 @@ void bli_cgemm_asm_8x4 " \n\t" " \n\t" // update c00:c70 " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70 into ymm0 + "vmovups (%%rcx), %%ymm0 \n\t" // load c00:c70 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c00:c70 + "vmovups %%ymm0, (%%rcx) \n\t" // store c00:c70 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c80:cf0 " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" // load c80:f0 into ymm0 + "vmovups (%%rdx), %%ymm0 \n\t" // load c80:f0 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c80:cf0 + "vmovups %%ymm0, (%%rdx) \n\t" // store c80:cf0 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c00:c70 " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c01:c71 into ymm0 + "vmovups (%%rcx), %%ymm0 \n\t" // load c01:c71 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c01:c71 + "vmovups %%ymm0, (%%rcx) \n\t" // store c01:c71 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c81:cf1 " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" // load c81:f1 into ymm0 + "vmovups (%%rdx), %%ymm0 \n\t" // load c81:f1 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, 
%%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c81:cf1 + "vmovups %%ymm0, (%%rdx) \n\t" // store c81:cf1 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c02:c72 " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72 into ymm0 + "vmovups (%%rcx), %%ymm0 \n\t" // load c02:c72 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c02:c72 + "vmovups %%ymm0, (%%rcx) \n\t" // store c02:c72 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c82:cf2 " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" // load c82:f2 into ymm0 + "vmovups (%%rdx), %%ymm0 \n\t" // load c82:f2 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c82:cf2 + "vmovups %%ymm0, (%%rdx) \n\t" // store c82:cf2 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c03:c73 " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c03:c73 into ymm0 + "vmovups (%%rcx), %%ymm0 \n\t" // load c03:c73 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c03:c73 + "vmovups %%ymm0, (%%rcx) \n\t" // store c03:c73 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c83:cf3 " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" 
// load c83:f3 into ymm0 + "vmovups (%%rdx), %%ymm0 \n\t" // load c83:f3 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c83:cf3 + "vmovups %%ymm0, (%%rdx) \n\t" // store c83:cf3 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" @@ -2552,11 +2493,9 @@ void bli_cgemm_asm_8x4 " \n\t" " \n\t" ".CBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .CCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. + "jz .CCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -2643,28 +2582,28 @@ void bli_cgemm_asm_8x4 ".CCOLSTORBZ: \n\t" " \n\t" " \n\t" - "vmovaps %%ymm15, (%%rcx) \n\t" // store c00:c70 + "vmovups %%ymm15, (%%rcx) \n\t" // store c00:c70 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm14, (%%rdx) \n\t" // store c80:cf0 + "vmovups %%ymm14, (%%rdx) \n\t" // store c80:cf0 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm13, (%%rcx) \n\t" // store c01:c71 + "vmovups %%ymm13, (%%rcx) \n\t" // store c01:c71 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm12, (%%rdx) \n\t" // store c81:cf1 + "vmovups %%ymm12, (%%rdx) \n\t" // store c81:cf1 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm11, (%%rcx) \n\t" // store c02:c72 + "vmovups %%ymm11, (%%rcx) \n\t" // store c02:c72 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm10, (%%rdx) \n\t" // store c82:cf2 + "vmovups %%ymm10, (%%rdx) \n\t" // store c82:cf2 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm9, (%%rcx) \n\t" // store c03:c73 + "vmovups %%ymm9, 
(%%rcx) \n\t" // store c03:c73 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm8, (%%rdx) \n\t" // store c83:cf3 + "vmovups %%ymm8, (%%rdx) \n\t" // store c83:cf3 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" @@ -3178,26 +3117,6 @@ void bli_zgemm_asm_4x4 "leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 16*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (16*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -3209,10 +3128,8 @@ void bli_zgemm_asm_4x4 "jne .ZBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .ZCOLSTORED \n\t" // jump to column storage case + "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. 
+ "jz .ZCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -3345,90 +3262,90 @@ void bli_zgemm_asm_4x4 ".ZCOLSTORED: \n\t" " \n\t" // update c00:c30 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c00:c30 into ymm0 + "vmovupd (%%rcx), %%ymm0 \n\t" // load c00:c30 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c00:c30 + "vmovupd %%ymm0, (%%rcx) \n\t" // store c00:c30 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c40:c70 " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c40:c70 into ymm0 + "vmovupd (%%rdx), %%ymm0 \n\t" // load c40:c70 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c40:c70 + "vmovupd %%ymm0, (%%rdx) \n\t" // store c40:c70 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c01:c31 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c01:c31 into ymm0 + "vmovupd (%%rcx), %%ymm0 \n\t" // load c01:c31 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c01:c31 + "vmovupd %%ymm0, (%%rcx) \n\t" // store c01:c31 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c41:c71 " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c41:c71 into ymm0 + "vmovupd (%%rdx), %%ymm0 \n\t" // load c41:c71 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, 
%%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c41:c71 + "vmovupd %%ymm0, (%%rdx) \n\t" // store c41:c71 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c02:c32 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c02:c32 into ymm0 + "vmovupd (%%rcx), %%ymm0 \n\t" // load c02:c32 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c02:c32 + "vmovupd %%ymm0, (%%rcx) \n\t" // store c02:c32 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c42:c72 " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c42:c72 into ymm0 + "vmovupd (%%rdx), %%ymm0 \n\t" // load c42:c72 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c42:c72 + "vmovupd %%ymm0, (%%rdx) \n\t" // store c42:c72 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c03:c33 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c03:c33 into ymm0 + "vmovupd (%%rcx), %%ymm0 \n\t" // load c03:c33 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c03:c33 + "vmovupd %%ymm0, (%%rcx) \n\t" // store c03:c33 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c43:c73 " \n\t" - "vmovapd (%%rdx), %%ymm0 
\n\t" // load c43:c73 into ymm0 + "vmovupd (%%rdx), %%ymm0 \n\t" // load c43:c73 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c43:c73 + "vmovupd %%ymm0, (%%rdx) \n\t" // store c43:c73 " \n\t" " \n\t" " \n\t" @@ -3437,11 +3354,9 @@ void bli_zgemm_asm_4x4 " \n\t" " \n\t" ".ZBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .ZCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. + "jz .ZCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -3510,28 +3425,28 @@ void bli_zgemm_asm_4x4 ".ZCOLSTORBZ: \n\t" " \n\t" " \n\t" - "vmovapd %%ymm15, (%%rcx) \n\t" // store c00:c30 + "vmovupd %%ymm15, (%%rcx) \n\t" // store c00:c30 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm14, (%%rdx) \n\t" // store c40:c70 + "vmovupd %%ymm14, (%%rdx) \n\t" // store c40:c70 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm13, (%%rcx) \n\t" // store c01:c31 + "vmovupd %%ymm13, (%%rcx) \n\t" // store c01:c31 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm12, (%%rdx) \n\t" // store c41:c71 + "vmovupd %%ymm12, (%%rdx) \n\t" // store c41:c71 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm11, (%%rcx) \n\t" // store c02:c32 + "vmovupd %%ymm11, (%%rcx) \n\t" // store c02:c32 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm10, (%%rdx) \n\t" // store c42:c72 + "vmovupd %%ymm10, (%%rdx) \n\t" // store c42:c72 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm9, (%%rcx) \n\t" // store c03:c33 + "vmovupd %%ymm9, (%%rcx) \n\t" // store 
c03:c33 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm8, (%%rdx) \n\t" // store c43:c73 + "vmovupd %%ymm8, (%%rdx) \n\t" // store c43:c73 " \n\t" " \n\t" " \n\t" diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index 0a45266d9..c8e9ec5d5 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -79,14 +79,15 @@ int main( int argc, char** argv ) k_input = -1; #if 0 - extern blksz_t* gemm_kc; + num_t dt_real = bli_datatype_proj_to_real( DT ); + cntx_t cntx; - num_t dt_real = bli_datatype_proj_to_real( DT ); + bli_gemm_cntx_init( &cntx ); // Extract the kc blocksize for the requested datatype and its // real analogue. - dim_t kc = bli_blksz_get_def( dt, gemm_kc ); - dim_t kc_real = bli_blksz_get_def( dt_real, gemm_kc ); + dim_t kc = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); + dim_t kc_real = bli_cntx_get_blksz_def_dt( dt_real, BLIS_KC, &cntx ); // Assign the k dimension depending on which implementation is // being tested. Note that the BLIS_NAT case handles the real @@ -163,7 +164,7 @@ int main( int argc, char** argv ) bli_ind_enable_dt( IND, dt ); #endif - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_gemm.c b/test/test_gemm.c index 7d5ac6a9c..dd46b5237 100644 --- a/test/test_gemm.c +++ b/test/test_gemm.c @@ -127,7 +127,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_gemv.c b/test/test_gemv.c index 14aa3b87b..cab20d0de 100644 --- a/test/test_gemv.c +++ b/test/test_gemv.c @@ -107,7 +107,7 @@ int main( int argc, char** argv ) bli_copym( &y, &y_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_ger.c b/test/test_ger.c index a56da98d1..8564f1cfd 100644 --- a/test/test_ger.c +++ b/test/test_ger.c @@ -105,7 +105,7 @@ int main( int argc, char** argv ) bli_copym( &a, &a_save ); - dtime_save = 
1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_hemm.c b/test/test_hemm.c index 3844f5623..03969893e 100644 --- a/test/test_hemm.c +++ b/test/test_hemm.c @@ -146,7 +146,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_hemv.c b/test/test_hemv.c index 81593e9f0..ab7ae941e 100644 --- a/test/test_hemv.c +++ b/test/test_hemv.c @@ -114,7 +114,7 @@ int main( int argc, char** argv ) bli_copym( &y, &y_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_her.c b/test/test_her.c index a3df5faa1..44cb91b6e 100644 --- a/test/test_her.c +++ b/test/test_her.c @@ -111,7 +111,7 @@ int main( int argc, char** argv ) bli_copym( &a, &a_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_her2.c b/test/test_her2.c index 1ee954c07..7bb27b106 100644 --- a/test/test_her2.c +++ b/test/test_her2.c @@ -112,7 +112,7 @@ int main( int argc, char** argv ) bli_copym( &a, &a_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_her2k.c b/test/test_her2k.c index 3f1de8bbf..0204051a1 100644 --- a/test/test_her2k.c +++ b/test/test_her2k.c @@ -137,7 +137,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_herk.c b/test/test_herk.c index bbad4e5d4..a3ac93adb 100644 --- a/test/test_herk.c +++ b/test/test_herk.c @@ -129,7 +129,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_trmm.c b/test/test_trmm.c index f75855923..e72028153 100644 --- a/test/test_trmm.c +++ b/test/test_trmm.c @@ -144,7 +144,7 @@ int main( int argc, char** argv ) 
bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_trmv.c b/test/test_trmv.c index 2fdb49fdc..6eb089f2a 100644 --- a/test/test_trmv.c +++ b/test/test_trmv.c @@ -108,7 +108,7 @@ int main( int argc, char** argv ) bli_copym( &x, &x_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_trsm.c b/test/test_trsm.c index ba8cf3bb6..f23e4de12 100644 --- a/test/test_trsm.c +++ b/test/test_trsm.c @@ -144,7 +144,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_trsv.c b/test/test_trsv.c index 1a8777aca..c61edaf03 100644 --- a/test/test_trsv.c +++ b/test/test_trsv.c @@ -107,7 +107,7 @@ int main( int argc, char** argv ) bli_copym( &x, &x_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/testsuite/src/test_addm.c b/testsuite/src/test_addm.c index 92dbca677..fe0f3172a 100644 --- a/testsuite/src/test_addm.c +++ b/testsuite/src/test_addm.c @@ -142,7 +142,7 @@ void libblis_test_addm_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_addv.c b/testsuite/src/test_addv.c index 3e8225892..36067b7fc 100644 --- a/testsuite/src/test_addv.c +++ b/testsuite/src/test_addv.c @@ -141,7 +141,7 @@ void libblis_test_addv_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_axpbyv.c b/testsuite/src/test_axpbyv.c index a3b030784..ff05a0b42 100644 --- a/testsuite/src/test_axpbyv.c +++ b/testsuite/src/test_axpbyv.c @@ -155,7 +155,7 @@ void libblis_test_axpbyv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git 
a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index 7a67c71a8..6f5515127 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -155,7 +155,7 @@ void libblis_test_axpy2v_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 3a67f18b2..706359ca4 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -153,7 +153,7 @@ void libblis_test_axpyf_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, b_n; diff --git a/testsuite/src/test_axpym.c b/testsuite/src/test_axpym.c index ae8903fab..896373ed1 100644 --- a/testsuite/src/test_axpym.c +++ b/testsuite/src/test_axpym.c @@ -150,7 +150,7 @@ void libblis_test_axpym_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_axpyv.c b/testsuite/src/test_axpyv.c index 779c40ac5..472798b85 100644 --- a/testsuite/src/test_axpyv.c +++ b/testsuite/src/test_axpyv.c @@ -150,7 +150,7 @@ void libblis_test_axpyv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_copym.c b/testsuite/src/test_copym.c index a2c023a4c..6993fd302 100644 --- a/testsuite/src/test_copym.c +++ b/testsuite/src/test_copym.c @@ -141,7 +141,7 @@ void libblis_test_copym_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_copyv.c b/testsuite/src/test_copyv.c index 13fd6c01c..5029227d6 100644 --- a/testsuite/src/test_copyv.c +++ b/testsuite/src/test_copyv.c @@ -141,7 +141,7 @@ void libblis_test_copyv_experiment double* 
resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index bf573f71a..36b88cc2f 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -157,7 +157,7 @@ void libblis_test_dotaxpyv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_dotv.c b/testsuite/src/test_dotv.c index 1fa11fcfb..ece73cdb2 100644 --- a/testsuite/src/test_dotv.c +++ b/testsuite/src/test_dotv.c @@ -146,7 +146,7 @@ void libblis_test_dotv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index 81f78d94b..dd83dc49e 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -163,7 +163,7 @@ void libblis_test_dotxaxpyf_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, b_n; diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index 95009c4ee..3a29b41b7 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -155,7 +155,7 @@ void libblis_test_dotxf_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, b_n; diff --git a/testsuite/src/test_dotxv.c b/testsuite/src/test_dotxv.c index c0ed77b55..e394cf0ac 100644 --- a/testsuite/src/test_dotxv.c +++ b/testsuite/src/test_dotxv.c @@ -151,7 +151,7 @@ void libblis_test_dotxv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index 
aaa9bc408..222dca395 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -157,7 +157,7 @@ void libblis_test_gemm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n, k; diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index d1877d6d6..0bb3c4440 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -158,7 +158,7 @@ void libblis_test_gemm_ukr_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n, k; diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 5f0babc07..c74d47d60 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -177,7 +177,7 @@ void libblis_test_gemmtrsm_ukr_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n, k; diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c index 71427932c..b254a861c 100644 --- a/testsuite/src/test_gemv.c +++ b/testsuite/src/test_gemv.c @@ -154,7 +154,7 @@ void libblis_test_gemv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c index 4a23a02c0..fc7944f52 100644 --- a/testsuite/src/test_ger.c +++ b/testsuite/src/test_ger.c @@ -152,7 +152,7 @@ void libblis_test_ger_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c index 5f291a9c1..1b4231ba8 100644 --- a/testsuite/src/test_hemm.c +++ b/testsuite/src/test_hemm.c @@ -160,7 +160,7 @@ void libblis_test_hemm_experiment 
unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c index f786e32e4..6ab6fa11f 100644 --- a/testsuite/src/test_hemv.c +++ b/testsuite/src/test_hemv.c @@ -155,7 +155,7 @@ void libblis_test_hemv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_her.c b/testsuite/src/test_her.c index 5199b5715..37ec26c1d 100644 --- a/testsuite/src/test_her.c +++ b/testsuite/src/test_her.c @@ -152,7 +152,7 @@ void libblis_test_her_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_her2.c b/testsuite/src/test_her2.c index 0778ce514..d3660d7c2 100644 --- a/testsuite/src/test_her2.c +++ b/testsuite/src/test_her2.c @@ -154,7 +154,7 @@ void libblis_test_her2_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_her2k.c b/testsuite/src/test_her2k.c index 1a1759f2c..95d0dbf72 100644 --- a/testsuite/src/test_her2k.c +++ b/testsuite/src/test_her2k.c @@ -158,7 +158,7 @@ void libblis_test_her2k_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, k; diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c index 39d16fbf3..37853efb7 100644 --- a/testsuite/src/test_herk.c +++ b/testsuite/src/test_herk.c @@ -156,7 +156,7 @@ void libblis_test_herk_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, k; diff --git a/testsuite/src/test_normfm.c b/testsuite/src/test_normfm.c index 
5e6b76c39..b0b4735ca 100644 --- a/testsuite/src/test_normfm.c +++ b/testsuite/src/test_normfm.c @@ -145,7 +145,7 @@ void libblis_test_normfm_experiment num_t dt_real = bli_datatype_proj_to_real( datatype ); - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_normfv.c b/testsuite/src/test_normfv.c index c10135516..a4de1f882 100644 --- a/testsuite/src/test_normfv.c +++ b/testsuite/src/test_normfv.c @@ -145,7 +145,7 @@ void libblis_test_normfv_experiment num_t dt_real = bli_datatype_proj_to_real( datatype ); - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_randm.c b/testsuite/src/test_randm.c index e3defdad7..55e3920be 100644 --- a/testsuite/src/test_randm.c +++ b/testsuite/src/test_randm.c @@ -140,7 +140,7 @@ void libblis_test_randm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_randv.c b/testsuite/src/test_randv.c index e18a2ec13..776d4c647 100644 --- a/testsuite/src/test_randv.c +++ b/testsuite/src/test_randv.c @@ -140,7 +140,7 @@ void libblis_test_randv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_scal2m.c b/testsuite/src/test_scal2m.c index 144f2e03c..8e1257f25 100644 --- a/testsuite/src/test_scal2m.c +++ b/testsuite/src/test_scal2m.c @@ -149,7 +149,7 @@ void libblis_test_scal2m_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_scal2v.c b/testsuite/src/test_scal2v.c index 82f749230..9620754f2 100644 --- a/testsuite/src/test_scal2v.c +++ b/testsuite/src/test_scal2v.c @@ -149,7 +149,7 @@ void libblis_test_scal2v_experiment unsigned int n_repeats = 
params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c index 76688ee9a..3d59e3bd0 100644 --- a/testsuite/src/test_scalm.c +++ b/testsuite/src/test_scalm.c @@ -145,7 +145,7 @@ void libblis_test_scalm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c index 90e9daf68..df10e33a9 100644 --- a/testsuite/src/test_scalv.c +++ b/testsuite/src/test_scalv.c @@ -146,7 +146,7 @@ void libblis_test_scalv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_setm.c b/testsuite/src/test_setm.c index 214c43fdf..a077baee3 100644 --- a/testsuite/src/test_setm.c +++ b/testsuite/src/test_setm.c @@ -142,7 +142,7 @@ void libblis_test_setm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_setv.c b/testsuite/src/test_setv.c index b587d7fb0..459eea6aa 100644 --- a/testsuite/src/test_setv.c +++ b/testsuite/src/test_setv.c @@ -142,7 +142,7 @@ void libblis_test_setv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_subm.c b/testsuite/src/test_subm.c index 3cae1c0a2..8e98e7e6c 100644 --- a/testsuite/src/test_subm.c +++ b/testsuite/src/test_subm.c @@ -142,7 +142,7 @@ void libblis_test_subm_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_subv.c b/testsuite/src/test_subv.c index 7cafd2a4b..c9732ad94 100644 --- a/testsuite/src/test_subv.c +++ 
b/testsuite/src/test_subv.c @@ -142,7 +142,7 @@ void libblis_test_subv_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c index 992968568..13396d849 100644 --- a/testsuite/src/test_symm.c +++ b/testsuite/src/test_symm.c @@ -160,7 +160,7 @@ void libblis_test_symm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_symv.c b/testsuite/src/test_symv.c index 89ecad953..6a6165a8d 100644 --- a/testsuite/src/test_symv.c +++ b/testsuite/src/test_symv.c @@ -155,7 +155,7 @@ void libblis_test_symv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_syr.c b/testsuite/src/test_syr.c index d3e51b261..525460f91 100644 --- a/testsuite/src/test_syr.c +++ b/testsuite/src/test_syr.c @@ -152,7 +152,7 @@ void libblis_test_syr_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_syr2.c b/testsuite/src/test_syr2.c index 4ece061b0..33bf6b536 100644 --- a/testsuite/src/test_syr2.c +++ b/testsuite/src/test_syr2.c @@ -154,7 +154,7 @@ void libblis_test_syr2_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c index 6bae557e3..cdb4a185e 100644 --- a/testsuite/src/test_syr2k.c +++ b/testsuite/src/test_syr2k.c @@ -158,7 +158,7 @@ void libblis_test_syr2k_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, k; diff --git a/testsuite/src/test_syrk.c 
b/testsuite/src/test_syrk.c index afd73e7fa..e13da6543 100644 --- a/testsuite/src/test_syrk.c +++ b/testsuite/src/test_syrk.c @@ -156,7 +156,7 @@ void libblis_test_syrk_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, k; diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c index 81114caa2..4099806d3 100644 --- a/testsuite/src/test_trmm.c +++ b/testsuite/src/test_trmm.c @@ -156,7 +156,7 @@ void libblis_test_trmm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c index 96645e87c..7ce850282 100644 --- a/testsuite/src/test_trmm3.c +++ b/testsuite/src/test_trmm3.c @@ -160,7 +160,7 @@ void libblis_test_trmm3_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c index 38fc02b3b..d69224a4f 100644 --- a/testsuite/src/test_trmv.c +++ b/testsuite/src/test_trmv.c @@ -151,7 +151,7 @@ void libblis_test_trmv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_trsm.c b/testsuite/src/test_trsm.c index cd0acac16..0fbc26860 100644 --- a/testsuite/src/test_trsm.c +++ b/testsuite/src/test_trsm.c @@ -156,7 +156,7 @@ void libblis_test_trsm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 1592fc847..200a6d1a8 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -160,7 +160,7 @@ void libblis_test_trsm_ukr_experiment 
unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c index dc5e118bd..a9f243103 100644 --- a/testsuite/src/test_trsv.c +++ b/testsuite/src/test_trsv.c @@ -151,7 +151,7 @@ void libblis_test_trsv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_xpbyv.c b/testsuite/src/test_xpbyv.c index d490a565a..46f79c3ea 100644 --- a/testsuite/src/test_xpbyv.c +++ b/testsuite/src/test_xpbyv.c @@ -149,7 +149,7 @@ void libblis_test_xpbyv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; From 16a4c7a823d60707ed9272f5d36e5c5d54c0ba4b Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 19 Aug 2016 11:38:36 -0500 Subject: [PATCH 04/27] Fixed bugs in bli_mutex_init() and friends. Details: - Fixed a couple of bugs that affected OpenMP and POSIX threads configurations that resulted in compiler errors and warnings due to type mismatch, and in the case of pthreads, a missing function argument. The bugs are fairly recent, introduced in a017062. --- frame/thread/bli_mutex_openmp.h | 8 ++++---- frame/thread/bli_mutex_pthreads.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/frame/thread/bli_mutex_openmp.h b/frame/thread/bli_mutex_openmp.h index 4aa82f8ae..cb13df5d3 100644 --- a/frame/thread/bli_mutex_openmp.h +++ b/frame/thread/bli_mutex_openmp.h @@ -50,20 +50,20 @@ typedef struct mtx_s // Define macros to operate on OpenMP-based mtx_t. 
#define bli_mutex_init( mtx_p ) \ { \ - omp_init_lock( mtx_p ); \ + omp_init_lock( &((mtx_p)->mutex) ); \ } #define bli_mutex_finalize( mtx_p ) \ { \ - omp_destroy_lock( mtx_p ); \ + omp_destroy_lock( &((mtx_p)->mutex) ); \ } #define bli_mutex_lock( mtx_p ) \ { \ - omp_set_lock( mtx_p ); \ + omp_set_lock( &((mtx_p)->mutex) ); \ } #define bli_mutex_unlock( mtx_p ) \ { \ - omp_unset_lock( mtx_p ); \ + omp_unset_lock( &((mtx_p)->mutex) ); \ } #endif diff --git a/frame/thread/bli_mutex_pthreads.h b/frame/thread/bli_mutex_pthreads.h index 0ab1876b3..328f9fd6b 100644 --- a/frame/thread/bli_mutex_pthreads.h +++ b/frame/thread/bli_mutex_pthreads.h @@ -50,20 +50,20 @@ typedef struct mtx_s // Define macros to operate on pthread-based mtx_t. #define bli_mutex_init( mtx_p ) \ { \ - pthread_mutex_init( mtx_p ); \ + pthread_mutex_init( &((mtx_p)->mutex), NULL ); \ } #define bli_mutex_finalize( mtx_p ) \ { \ - pthread_mutex_destroy( mtx_p ); \ + pthread_mutex_destroy( &((mtx_p)->mutex) ); \ } #define bli_mutex_lock( mtx_p ) \ { \ - pthread_mutex_lock( mtx_p ); \ + pthread_mutex_lock( &((mtx_p)->mutex) ); \ } #define bli_mutex_unlock( mtx_p ) \ { \ - pthread_mutex_unlock( mtx_p ); \ + pthread_mutex_unlock( &((mtx_p)->mutex) ); \ } #endif From 50293da38d5f2b7be9bbc94b9e85aacb6a10f672 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 23 Aug 2016 13:38:36 -0500 Subject: [PATCH 05/27] Avoid compiling BLAS/CBLAS files when disabled. Details: - Updated the top-level Makefile, build/config.mk.in template, and configure script so that object files corresponding to source files belonging to the BLAS compatibility layer are not compiled (or archived) when the compatibility layer is disabled. (Same for CBLAS.) Thanks to Devin Matthews for suggesting this optimization. - Slight change to the way configure handles internal variables. 
Instead of converting (overwriting) some, such as enable_blas2blis and enable_cblas, from a "yes" or "no" to a "1" or "0" value, the latter are now stored in new variables that live alongside the originals (with the suffix "_01"). This is convenient since some values need to be sed-substituted into the config.mk.in template, which requires "yes" or "no", while some need to be written to the bli_config.h.in template, which requires "0" or "1". --- Makefile | 13 +++++++++++++ build/config.mk.in | 4 ++++ configure | 32 +++++++++++++++++++------------- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 5ac386fec..1a4868eaa 100644 --- a/Makefile +++ b/Makefile @@ -312,6 +312,19 @@ MK_BLIS_CONFIG_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/% MK_ALL_BLIS_OBJS := $(MK_BLIS_CONFIG_OBJS) \ $(MK_BLIS_FRAME_OBJS) +# Optionally filter out the BLAS and CBLAS compatibility layer object files. +# This is not actually necessary, since each affected file is guarded by C +# preprocessor macros, but it but prevents "empty" object files from being +# added into the library (and reduces compilation time). 
+BASE_OBJ_BLAS_PATH := $(BASE_OBJ_FRAME_PATH)/compat +BASE_OBJ_CBLAS_PATH := $(BASE_OBJ_FRAME_PATH)/compat/cblas +ifeq ($(BLIS_ENABLE_CBLAS),no) +MK_ALL_BLIS_OBJS := $(filter-out $(BASE_OBJ_CBLAS_PATH)/%.o, $(MK_ALL_BLIS_OBJS) ) +endif +ifeq ($(BLIS_ENABLE_BLAS2BLIS),no) +MK_ALL_BLIS_OBJS := $(filter-out $(BASE_OBJ_BLAS_PATH)/%.o, $(MK_ALL_BLIS_OBJS) ) +endif + # diff --git a/build/config.mk.in b/build/config.mk.in index 8bdb427a0..9d92f7fb4 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -64,5 +64,9 @@ BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := @enable_verbose@ BLIS_ENABLE_STATIC_BUILD := @enable_static@ BLIS_ENABLE_DYNAMIC_BUILD := @enable_dynamic@ +# The status of BLAS and CBLAS compatibility layers +BLIS_ENABLE_BLAS2BLIS := @enable_blas2blis@ +BLIS_ENABLE_CBLAS := @enable_cblas@ + # end of ifndef CONFIG_MK_INCLUDED conditional block endif diff --git a/configure b/configure index 8af3bde66..e0dc82c89 100755 --- a/configure +++ b/configure @@ -486,16 +486,20 @@ main() # Check the threading model flag. - enable_openmp=0 - enable_pthreads=0 + enable_openmp='no' + enable_openmp_01=0 + enable_pthreads='no' + enable_pthreads_01=0 if [ "x${threading_model}" = "xauto" ]; then echo "${script_name}: determining the threading model automatically." elif [ "x${threading_model}" = "xomp" ]; then echo "${script_name}: using OpenMP for threading." - enable_openmp=1 + enable_openmp='yes' + enable_openmp_01=1 elif [ "x${threading_model}" = "xpthreads" ]; then echo "${script_name}: using Pthreads for threading." - enable_pthreads=1 + enable_pthreads='yes' + enable_pthreads_01=1 elif [ "x${threading_model}" = "xno" ]; then echo "${script_name}: threading is disabled." else @@ -507,19 +511,19 @@ main() # Convert 'yes' and 'no' flags to booleans. if [ "x${enable_cblas}" = "xyes" ]; then echo "${script_name}: the CBLAS compatibility layer is enabled." 
- enable_cblas=1 + enable_cblas_01=1 # Force BLAS layer when CBLAS is enabled - enable_blas='yes' + enable_blas2blis='yes' else echo "${script_name}: the CBLAS compatibility layer is disabled." - enable_cblas=0 + enable_cblas_01=0 fi if [ "x${enable_blas2blis}" = "xyes" ]; then echo "${script_name}: the BLAS compatibility layer is enabled." - enable_blas2blis=1 + enable_blas2blis_01=1 else echo "${script_name}: the BLAS compatibility layer is disabled." - enable_blas2blis=0 + enable_blas2blis_01=0 fi @@ -561,6 +565,8 @@ main() | sed "s/@enable_static@/${enable_static}/g" \ | sed "s/@enable_dynamic@/${enable_shared}/g" \ | sed "s/@threading_model@/${threading_model}/g" \ + | sed "s/@enable_blas2blis@/${enable_blas2blis}/g" \ + | sed "s/@enable_cblas@/${enable_cblas}/g" \ > "${config_mk_out_path}" @@ -568,12 +574,12 @@ main() # to bli_config_h_out. echo "${script_name}: creating ${bli_config_h_out_path} from ${bli_config_h_in_path}" cat "${bli_config_h_in_path}" \ - | sed "s/@enable_openmp@/${enable_openmp}/g" \ - | sed "s/@enable_pthreads@/${enable_pthreads}/g" \ + | sed "s/@enable_openmp@/${enable_openmp_01}/g" \ + | sed "s/@enable_pthreads@/${enable_pthreads_01}/g" \ | sed "s/@int_type_size@/${int_type_size}/g" \ | sed "s/@blas2blis_int_type_size@/${blas2blis_int_type_size}/g" \ - | sed "s/@enable_blas2blis@/${enable_blas2blis}/g" \ - | sed "s/@enable_cblas@/${enable_cblas}/g" \ + | sed "s/@enable_blas2blis@/${enable_blas2blis_01}/g" \ + | sed "s/@enable_cblas@/${enable_cblas_01}/g" \ > "${bli_config_h_out_path}" From 701b9aa3ff028decbf90efac0dca5bd64fe26269 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 26 Aug 2016 19:04:45 -0500 Subject: [PATCH 06/27] Redesigned control tree infrastructure. 
Details: - Altered control tree node struct definitions so that all nodes have the same struct definition, whose primary fields consist of a blocksize id, a variant function pointer, a pointer to an optional parameter struct, and a pointer to a (single) sub-node. This unified control tree type is now named cntl_t. - Changed the way control tree nodes are connected, and what computation they represent, such that, for example, packing operations are now associated with nodes that are "inline" in the tree, rather than off-shoot branches. The original tree for the classic Goto gemm algorithm was expressed (roughly) as: blk_var2 -> blk_var3 -> blk_var1 -> ker_var2 | | -> packb -> packa and now, the same tree would look like: blk_var2 -> blk_var3 -> packb -> blk_var1 -> packa -> ker_var2 Specifically, the packb and packa nodes perform their respective packing operations and then recurse (without any loop) to a subproblem. This means there are now two kinds of level-3 control tree nodes: partitioning and non-partitioning. The blocked variants are members of the former, because they iteratively partition off submatrices and perform suboperations on those partitions, while the packing variants belong to the latter group. (This change has the effect of allowing greatly simplified initialization of the nodes, which previously involved setting many unused node fields to NULL.) - Changed the way thrinfo_t tree nodes are arranged to mirror the new connective structure of control trees. That is, packm nodes are no longer off-shoot branches of the main algorithmic nodes, but rather connected "inline". - Simplified control tree creation functions. Partitioning nodes are created concisely with just a few fields needing initialization. By contrast, the packing nodes require additional parameters, which are stored in a packm-specific struct that is tracked via the optional parameters pointer within the control tree struct. 
(This parameter struct must always begin with a uint64_t that contains the byte size of the struct. This allows us to use a generic function to recursively copy control trees.) gemm, herk, and trmm control tree creation continues to be consolidated into a single function, with the operation family being used to select among the parameter-agnostic macro-kernel wrappers. A single routine, bli_cntl_free(), is provided to free control trees recursively, whereby the chief thread within a group releases the blocks associated with mem_t entries back to the memory broker from which they were acquired. - Updated internal back-ends, e.g. bli_gemm_int(), to query and call the function pointer stored in the current control tree node (rather than index into a local function pointer array). Before being invoked, these function pointers are first cast to a gemm_voft (for gemm, herk, or trmm families) or trsm_voft (for trsm family) type, which is defined in frame/3/bli_l3_var_oft.h. - Retired herk and trmm internal back-ends, since all execution now flows through gemm or trsm blocked variants. - Merged forwards- and backwards-moving variants by querying the direction from routines as a function of the variant's matrix operands. gemm and herk always move forward, while trmm and trsm move in a direction that is dependent on which operand (a or b) is triangular. - Added functions bli_thread_get_range_mdim(), bli_thread_get_range_ndim(), each of which takes additional arguments and hides complexity in managing the difference between the way ranges are computed for the four families of operations. - Simplified level-3 blocked variants according to the above changes, so that the only steps taken are: 1. Query partitioning direction (forwards or backwards). 2. Prune unreferenced regions, if they exist. 3. Determine the thread partitioning sub-ranges. 4. Determine the partitioning blocksize (passing in the partitioning direction) 5. 
Acquire the current iteration's partitions for the matrices affected by the current variant's partitioning dimension (m, k, n). 6. Call the subproblem. - Instantiate control trees once per thread, per operation invocation. (This is a change from the previous regime in which control trees were treated as stateless objects, initialized with the library, and shared as read-only objects between threads.) This once-per-thread allocation is done primarily to allow threads to use the control tree as a place to cache certain data for use in subsequent loop iterations. Presently, the only application of this caching is a mem_t entry for the packing blocks checked out from the memory broker (allocator). If a non-NULL control tree is passed in by the (expert) user, then the tree is copied by each thread. This is done in bli_l3_thread_decorator(), in bli_thrcomm_*.c. - Added a new field to the context, an opid_t which tracks the "family" of the operation being executed. For example, gemm, hemm, and symm are all part of the gemm family, while herk, syrk, her2k, and syr2k are all part of the herk family. Knowing the operation's family is necessary when conditionally executing the internal (beta) scalar reset on C in blocked variant 3, which is needed for gemm and herk families, but must not be performed for the trmm family (because beta has only been applied to the current row-panel of C after the first rank-kc iteration). - Reexpressed 3m3 induced method blocked variant in frame/3/gemm/ind to conform with the new control tree design, and renamed the macro-kernel codes corresponding to 3m2 and 4m1b. - Renamed bli_mem.c (and its APIs) to bli_memsys.c, and renamed/relocated bli_mem_macro_defs.h from frame/include to frame/base/bli_mem.h. - Renamed/relocated bli_auxinfo_macro_defs.h from frame/include to frame/base/bli_auxinfo.h. 
- Fixed a minor bug whereby the storage-to-ukr-preference matching optimization in the various level-3 front-ends was not being applied properly when the context indicated that execution would be via an induced method. (Before, we always checked the native micro-kernel corresponding to the datatype being executed, whereas now we check the native micro-kernel corresponding to the datatype's real projection, since that is the micro-kernel that is actually used by induced methods.) - Added an option to the testsuite to skip the testing of native level-3 complex implementations. Previously, it was always tested, provided that the c/z datatypes were enabled. However, some configurations use reference micro-kernels for complex datatypes, and testing these implementations can slow down the testsuite considerably. --- frame/1/bli_l1v.h | 10 +- frame/1/{ => other}/packv/bli_packv.c | 0 frame/1/{ => other}/packv/bli_packv.h | 0 frame/1/{ => other}/packv/bli_packv_check.c | 0 frame/1/{ => other}/packv/bli_packv_check.h | 0 frame/1/{ => other}/packv/bli_packv_cntl.c | 38 ++ .../other/packv/bli_packv_cntl.h} | 65 +-- frame/1/{ => other}/packv/bli_packv_init.c | 111 +---- frame/1/{ => other}/packv/bli_packv_init.h | 15 +- frame/1/{ => other}/packv/bli_packv_int.c | 32 +- frame/1/{ => other}/packv/bli_packv_int.h | 0 .../1/{ => other}/packv/bli_packv_unb_var1.c | 0 .../1/{ => other}/packv/bli_packv_unb_var1.h | 0 frame/1/{ => other}/scalv/bli_scalv_cntl.c | 0 frame/1/{ => other}/scalv/bli_scalv_cntl.h | 0 frame/1/{ => other}/scalv/bli_scalv_int.c | 0 frame/1/{ => other}/scalv/bli_scalv_int.h | 0 frame/1/{ => other}/unpackv/bli_unpackv.c | 0 frame/1/{ => other}/unpackv/bli_unpackv.h | 0 .../1/{ => other}/unpackv/bli_unpackv_check.c | 0 .../1/{ => other}/unpackv/bli_unpackv_check.h | 0 .../1/{ => other}/unpackv/bli_unpackv_cntl.c | 0 .../1/{ => other}/unpackv/bli_unpackv_cntl.h | 0 frame/1/{ => other}/unpackv/bli_unpackv_int.c | 0 frame/1/{ => other}/unpackv/bli_unpackv_int.h | 0 
.../unpackv/bli_unpackv_unb_var1.c | 0 .../unpackv/bli_unpackv_unb_var1.h | 0 frame/1m/bli_l1m.h | 4 +- frame/1m/bli_l1m_voft.h | 75 +++ frame/1m/packm/bli_packm_blk_var1.c | 115 +++-- frame/1m/packm/bli_packm_blk_var1.h | 12 +- frame/1m/packm/bli_packm_check.c | 18 +- frame/1m/packm/bli_packm_check.h | 19 +- frame/1m/packm/bli_packm_cntl.c | 136 ++--- frame/1m/packm/bli_packm_cntl.h | 101 ++-- frame/1m/packm/bli_packm_cntx.c | 2 +- frame/1m/packm/bli_packm_init.c | 331 ++++--------- frame/1m/packm/bli_packm_init.h | 42 +- frame/1m/packm/bli_packm_int.c | 70 +-- frame/1m/packm/bli_packm_int.h | 14 +- frame/1m/packm/bli_packm_thrinfo.c | 19 +- frame/1m/packm/bli_packm_thrinfo.h | 6 +- frame/1m/packm/bli_packm_unb_var1.c | 12 +- frame/1m/packm/bli_packm_unb_var1.h | 12 +- .../{cntl/bli_cntl.c => 1m/scalm/bli_scalm.h} | 6 +- frame/1m/scalm/bli_scalm_cntl.c | 47 +- frame/1m/scalm/bli_scalm_cntl.h | 21 +- frame/1m/scalm/{ => other}/bli_scalm_int.c | 0 frame/1m/scalm/{ => other}/bli_scalm_int.h | 0 frame/1m/unpackm/bli_unpackm.h | 3 +- ...ackm_blk_var2.c => bli_unpackm_blk_var1.c} | 16 +- frame/1m/unpackm/bli_unpackm_blk_var1.h | 66 +++ frame/1m/unpackm/bli_unpackm_check.c | 10 +- frame/1m/unpackm/bli_unpackm_check.h | 11 +- frame/1m/unpackm/bli_unpackm_cntl.c | 55 +- frame/1m/unpackm/bli_unpackm_cntl.h | 37 +- frame/1m/unpackm/bli_unpackm_cxk.c | 19 +- frame/1m/unpackm/bli_unpackm_int.c | 199 +------- frame/1m/unpackm/bli_unpackm_int.h | 18 +- frame/1m/unpackm/bli_unpackm_unb_var1.c | 12 +- frame/1m/unpackm/bli_unpackm_unb_var1.h | 12 +- frame/2/gemv/bli_gemv.h | 7 +- frame/2/gemv/bli_gemv_var.h | 2 +- frame/2/gemv/bli_gemv_var_oapi.c | 2 +- frame/2/gemv/old/bli_gemv_var_oapi.c.prev | 97 ++++ frame/2/gemv/{ => other}/bli_gemv_blk_var1.c | 0 frame/2/gemv/{ => other}/bli_gemv_blk_var2.c | 0 frame/2/gemv/{ => other}/bli_gemv_cntl.c | 0 frame/2/gemv/{ => other}/bli_gemv_cntl.h | 0 frame/2/gemv/{ => other}/bli_gemv_front.c | 0 frame/2/gemv/{ => other}/bli_gemv_front.h | 0 
frame/2/gemv/{ => other}/bli_gemv_int.c | 0 frame/2/gemv/{ => other}/bli_gemv_int.h | 0 frame/2/ger/bli_ger.h | 7 +- frame/2/ger/bli_ger_var.h | 2 +- frame/2/ger/bli_ger_var_oapi.c | 2 +- frame/2/ger/{ => other}/bli_ger_blk_var1.c | 0 frame/2/ger/{ => other}/bli_ger_blk_var2.c | 0 frame/2/ger/{ => other}/bli_ger_cntl.c | 0 frame/2/ger/{ => other}/bli_ger_cntl.h | 0 frame/2/ger/{ => other}/bli_ger_front.c | 0 frame/2/ger/{ => other}/bli_ger_front.h | 0 frame/2/ger/{ => other}/bli_ger_int.c | 0 frame/2/ger/{ => other}/bli_ger_int.h | 0 frame/2/hemv/bli_hemv.h | 7 +- frame/2/hemv/bli_hemv_var.h | 2 +- frame/2/hemv/bli_hemv_var_oapi.c | 2 +- frame/2/hemv/{ => other}/bli_hemv_blk_var1.c | 0 frame/2/hemv/{ => other}/bli_hemv_blk_var2.c | 0 frame/2/hemv/{ => other}/bli_hemv_blk_var3.c | 0 frame/2/hemv/{ => other}/bli_hemv_blk_var4.c | 0 frame/2/hemv/{ => other}/bli_hemv_cntl.c | 0 frame/2/hemv/{ => other}/bli_hemv_cntl.h | 0 frame/2/hemv/{ => other}/bli_hemv_front.c | 0 frame/2/hemv/{ => other}/bli_hemv_front.h | 0 frame/2/hemv/{ => other}/bli_hemv_int.c | 0 frame/2/hemv/{ => other}/bli_hemv_int.h | 0 frame/2/her/bli_her.h | 7 +- frame/2/her/bli_her_var.h | 2 +- frame/2/her/bli_her_var_oapi.c | 2 +- frame/2/her/{ => other}/bli_her_blk_var1.c | 0 frame/2/her/{ => other}/bli_her_blk_var2.c | 0 frame/2/her/{ => other}/bli_her_cntl.c | 0 frame/2/her/{ => other}/bli_her_cntl.h | 0 frame/2/her/{ => other}/bli_her_front.c | 0 frame/2/her/{ => other}/bli_her_front.h | 0 frame/2/her/{ => other}/bli_her_int.c | 0 frame/2/her/{ => other}/bli_her_int.h | 0 frame/2/her2/bli_her2.h | 7 +- frame/2/her2/bli_her2_var.h | 2 +- frame/2/her2/bli_her2_var_oapi.c | 2 +- frame/2/her2/{ => other}/bli_her2_blk_var1.c | 0 frame/2/her2/{ => other}/bli_her2_blk_var2.c | 0 frame/2/her2/{ => other}/bli_her2_blk_var3.c | 0 frame/2/her2/{ => other}/bli_her2_blk_var4.c | 0 frame/2/her2/{ => other}/bli_her2_cntl.c | 0 frame/2/her2/{ => other}/bli_her2_cntl.h | 0 frame/2/her2/{ => other}/bli_her2_front.c | 
0 frame/2/her2/{ => other}/bli_her2_front.h | 0 frame/2/her2/{ => other}/bli_her2_int.c | 0 frame/2/her2/{ => other}/bli_her2_int.h | 0 frame/2/symv/bli_symv.h | 3 +- frame/2/symv/{ => other}/bli_symv_front.c | 0 frame/2/symv/{ => other}/bli_symv_front.h | 0 frame/2/syr/bli_syr.h | 3 +- frame/2/syr/{ => other}/bli_syr_front.c | 0 frame/2/syr/{ => other}/bli_syr_front.h | 0 frame/2/syr2/bli_syr2.h | 3 +- frame/2/syr2/{ => other}/bli_syr2_front.c | 0 frame/2/syr2/{ => other}/bli_syr2_front.h | 0 frame/2/trmv/bli_trmv.h | 7 +- frame/2/trmv/bli_trmv_var.h | 2 +- frame/2/trmv/bli_trmv_var_oapi.c | 2 +- frame/2/trmv/{ => other}/bli_trmv_cntl.c | 0 frame/2/trmv/{ => other}/bli_trmv_cntl.h | 0 frame/2/trmv/{ => other}/bli_trmv_front.c | 0 frame/2/trmv/{ => other}/bli_trmv_front.h | 0 frame/2/trmv/{ => other}/bli_trmv_int.c | 0 frame/2/trmv/{ => other}/bli_trmv_int.h | 0 .../2/trmv/{ => other}/bli_trmv_l_blk_var1.c | 0 .../2/trmv/{ => other}/bli_trmv_l_blk_var2.c | 0 .../2/trmv/{ => other}/bli_trmv_u_blk_var1.c | 0 .../2/trmv/{ => other}/bli_trmv_u_blk_var2.c | 0 frame/2/trsv/bli_trsv.h | 7 +- frame/2/trsv/bli_trsv_var.h | 2 +- frame/2/trsv/bli_trsv_var_oapi.c | 2 +- frame/2/trsv/{ => other}/bli_trsv_cntl.c | 0 frame/2/trsv/{ => other}/bli_trsv_cntl.h | 0 frame/2/trsv/{ => other}/bli_trsv_front.c | 0 frame/2/trsv/{ => other}/bli_trsv_front.h | 0 frame/2/trsv/{ => other}/bli_trsv_int.c | 0 frame/2/trsv/{ => other}/bli_trsv_int.h | 0 .../2/trsv/{ => other}/bli_trsv_l_blk_var1.c | 0 .../2/trsv/{ => other}/bli_trsv_l_blk_var2.c | 0 .../2/trsv/{ => other}/bli_trsv_u_blk_var1.c | 0 .../2/trsv/{ => other}/bli_trsv_u_blk_var2.c | 0 frame/3/bli_l3.h | 6 +- frame/3/bli_l3_blocksize.c | 373 ++++---------- frame/3/bli_l3_blocksize.h | 6 + frame/3/bli_l3_check.c | 23 - frame/3/bli_l3_cntl.c | 114 +++++ .../{cntl/bli_cntl_init.h => 3/bli_l3_cntl.h} | 29 +- frame/3/bli_l3_direct.c | 22 + frame/3/bli_l3_direct.h | 9 + frame/3/bli_l3_packm.c | 171 +++++++ .../bli_gemm_blk_var4.h => 
bli_l3_packm.h} | 11 +- frame/3/bli_l3_prune.c | 80 +++ frame/3/bli_l3_prune.h | 21 + frame/3/bli_l3_thrinfo.c | 123 +++-- frame/3/bli_l3_thrinfo.h | 8 +- frame/3/bli_l3_voft.h | 76 +++ frame/3/gemm/bli_gemm_blk_var1.c | 111 +---- frame/3/gemm/bli_gemm_blk_var2.c | 109 +--- frame/3/gemm/bli_gemm_blk_var3.c | 131 ++--- frame/3/gemm/bli_gemm_cntl.c | 203 +++----- frame/3/gemm/bli_gemm_cntl.h | 42 +- frame/3/gemm/bli_gemm_front.c | 39 +- frame/3/gemm/bli_gemm_front.h | 2 +- frame/3/gemm/bli_gemm_int.c | 54 +- frame/3/gemm/bli_gemm_int.h | 2 +- frame/3/gemm/bli_gemm_ker_var2.c | 4 +- .../bli_gemm_ker_var3.h => bli_gemm_packab.c} | 95 ++-- frame/3/gemm/bli_gemm_var.h | 14 +- ...gemm_ker_var4.c => bli_gemm3m2_ker_var2.c} | 14 +- frame/3/gemm/ind/bli_gemm3m3_packa.c | 142 ++++++ ...gemm_ker_var3.c => bli_gemm4mb_ker_var2.c} | 10 +- frame/3/gemm/ind/bli_gemm_blk_var4.c | 229 --------- frame/3/hemm/bli_hemm_front.c | 39 +- frame/3/hemm/bli_hemm_front.h | 2 +- frame/3/her2k/bli_her2k_front.c | 75 ++- frame/3/her2k/bli_her2k_front.h | 2 +- frame/3/herk/bli_herk.h | 1 - frame/3/herk/bli_herk_blk_var1.c | 158 ------ frame/3/herk/bli_herk_blk_var2.c | 157 ------ frame/3/herk/bli_herk_blk_var3.c | 166 ------- frame/3/herk/bli_herk_front.c | 41 +- frame/3/herk/bli_herk_front.h | 2 +- frame/3/herk/bli_herk_l_ker_var2.c | 4 +- frame/3/herk/bli_herk_u_ker_var2.c | 4 +- frame/3/herk/bli_herk_var.h | 10 +- frame/3/herk/bli_herk_x_ker_var2.c | 2 +- .../herk/old/bli_herk_blk_var1.c} | 114 ++--- frame/3/herk/old/bli_herk_blk_var2.c | 98 ++++ frame/3/herk/old/bli_herk_blk_var3.c | 105 ++++ frame/3/herk/{ => old}/bli_herk_int.c | 26 +- frame/3/herk/{ => old}/bli_herk_int.h | 2 +- frame/3/symm/bli_symm_front.c | 41 +- frame/3/symm/bli_symm_front.h | 2 +- frame/3/syr2k/bli_syr2k_front.c | 77 ++- frame/3/syr2k/bli_syr2k_front.h | 2 +- frame/3/syrk/bli_syrk_front.c | 41 +- frame/3/syrk/bli_syrk_front.h | 2 +- frame/3/trmm/bli_trmm.h | 1 - frame/3/trmm/bli_trmm_blk_var1.c | 157 ------ 
frame/3/trmm/bli_trmm_blk_var2.c | 156 ------ frame/3/trmm/bli_trmm_blk_var3.c | 160 ------ frame/3/trmm/bli_trmm_front.c | 40 +- frame/3/trmm/bli_trmm_front.h | 2 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 4 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 4 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 4 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 4 +- frame/3/trmm/bli_trmm_var.h | 8 +- frame/3/trmm/bli_trmm_xx_ker_var2.c | 2 +- frame/3/trmm/old/bli_trmm_blk_var1.c | 98 ++++ frame/3/trmm/old/bli_trmm_blk_var2.c | 98 ++++ frame/3/trmm/old/bli_trmm_blk_var3.c | 105 ++++ frame/3/trmm/{ => old}/bli_trmm_int.c | 10 +- frame/3/trmm/{ => old}/bli_trmm_int.h | 2 +- frame/3/trmm3/bli_trmm3_front.c | 39 +- frame/3/trmm3/bli_trmm3_front.h | 2 +- frame/3/trsm/bli_trsm_blk_var1.c | 87 +--- frame/3/trsm/bli_trsm_blk_var2.c | 109 +--- frame/3/trsm/bli_trsm_blk_var3.c | 118 +---- frame/3/trsm/bli_trsm_cntl.c | 378 ++++++-------- frame/3/trsm/bli_trsm_cntl.h | 54 +- frame/3/trsm/bli_trsm_front.c | 43 +- frame/3/trsm/bli_trsm_front.h | 3 +- frame/3/trsm/bli_trsm_int.c | 59 +-- frame/3/trsm/bli_trsm_int.h | 2 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 11 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 11 +- .../bli_trsm_packab.c} | 95 ++-- frame/3/trsm/bli_trsm_rl_ker_var2.c | 11 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 11 +- frame/3/trsm/bli_trsm_var.h | 4 +- frame/3/trsm/bli_trsm_xx_ker_var2.c | 2 +- frame/3/trsm/old/bli_trsm_cntl.c | 268 ++++++++++ .../trsm/old/bli_trsm_cntl.h} | 49 +- .../bli_auxinfo.h} | 55 +- frame/base/bli_cntl.c | 186 +++++++ frame/base/bli_cntl.h | 153 ++++++ frame/base/bli_cntx.c | 51 +- frame/base/bli_cntx.h | 20 + frame/base/bli_info.c | 6 +- frame/base/bli_init.c | 8 +- frame/base/bli_mem.h | 91 +++- frame/base/bli_memsys.c | 174 +++++++ frame/base/bli_memsys.h | 52 ++ frame/base/bli_obj.c | 10 - frame/base/old/bli_mem.c.prev | 366 ++++++++++++++ frame/include/bli_extern_defs.h | 1 - frame/include/bli_macro_defs.h | 2 - frame/include/bli_mem_macro_defs.h | 126 ----- 
frame/include/bli_obj_macro_defs.h | 53 +- frame/include/bli_type_defs.h | 468 +++++++++--------- frame/include/blis.h | 2 + frame/ind/oapi/bli_l3_3m4m_oapi.c | 39 +- frame/ind/oapi/bli_l3_nat_oapi.c | 21 +- frame/thread/bli_thrcomm_openmp.c | 33 +- frame/thread/bli_thrcomm_pthreads.c | 119 +++-- frame/thread/bli_thrcomm_single.c | 56 ++- frame/thread/bli_thread.c | 140 ++++-- frame/thread/bli_thread.h | 45 +- frame/thread/bli_thrinfo.c | 63 +-- frame/thread/bli_thrinfo.h | 28 +- testsuite/input.general | 1 + testsuite/src/test_gemm_ukr.c | 53 +- testsuite/src/test_gemmtrsm_ukr.c | 59 +-- testsuite/src/test_libblis.c | 60 ++- testsuite/src/test_libblis.h | 2 +- testsuite/src/test_trsm_ukr.c | 54 +- 282 files changed, 5386 insertions(+), 4842 deletions(-) rename frame/1/{ => other}/packv/bli_packv.c (100%) rename frame/1/{ => other}/packv/bli_packv.h (100%) rename frame/1/{ => other}/packv/bli_packv_check.c (100%) rename frame/1/{ => other}/packv/bli_packv_check.h (100%) rename frame/1/{ => other}/packv/bli_packv_cntl.c (75%) rename frame/{cntl/bli_cntl.h => 1/other/packv/bli_packv_cntl.h} (67%) rename frame/1/{ => other}/packv/bli_packv_init.c (70%) rename frame/1/{ => other}/packv/bli_packv_init.h (88%) rename frame/1/{ => other}/packv/bli_packv_int.c (85%) rename frame/1/{ => other}/packv/bli_packv_int.h (100%) rename frame/1/{ => other}/packv/bli_packv_unb_var1.c (100%) rename frame/1/{ => other}/packv/bli_packv_unb_var1.h (100%) rename frame/1/{ => other}/scalv/bli_scalv_cntl.c (100%) rename frame/1/{ => other}/scalv/bli_scalv_cntl.h (100%) rename frame/1/{ => other}/scalv/bli_scalv_int.c (100%) rename frame/1/{ => other}/scalv/bli_scalv_int.h (100%) rename frame/1/{ => other}/unpackv/bli_unpackv.c (100%) rename frame/1/{ => other}/unpackv/bli_unpackv.h (100%) rename frame/1/{ => other}/unpackv/bli_unpackv_check.c (100%) rename frame/1/{ => other}/unpackv/bli_unpackv_check.h (100%) rename frame/1/{ => other}/unpackv/bli_unpackv_cntl.c (100%) rename frame/1/{ => 
other}/unpackv/bli_unpackv_cntl.h (100%) rename frame/1/{ => other}/unpackv/bli_unpackv_int.c (100%) rename frame/1/{ => other}/unpackv/bli_unpackv_int.h (100%) rename frame/1/{ => other}/unpackv/bli_unpackv_unb_var1.c (100%) rename frame/1/{ => other}/unpackv/bli_unpackv_unb_var1.h (100%) create mode 100644 frame/1m/bli_l1m_voft.h rename frame/{cntl/bli_cntl.c => 1m/scalm/bli_scalm.h} (95%) rename frame/1m/scalm/{ => other}/bli_scalm_int.c (100%) rename frame/1m/scalm/{ => other}/bli_scalm_int.h (100%) rename frame/1m/unpackm/{bli_unpackm_blk_var2.c => bli_unpackm_blk_var1.c} (96%) create mode 100644 frame/1m/unpackm/bli_unpackm_blk_var1.h create mode 100644 frame/2/gemv/old/bli_gemv_var_oapi.c.prev rename frame/2/gemv/{ => other}/bli_gemv_blk_var1.c (100%) rename frame/2/gemv/{ => other}/bli_gemv_blk_var2.c (100%) rename frame/2/gemv/{ => other}/bli_gemv_cntl.c (100%) rename frame/2/gemv/{ => other}/bli_gemv_cntl.h (100%) rename frame/2/gemv/{ => other}/bli_gemv_front.c (100%) rename frame/2/gemv/{ => other}/bli_gemv_front.h (100%) rename frame/2/gemv/{ => other}/bli_gemv_int.c (100%) rename frame/2/gemv/{ => other}/bli_gemv_int.h (100%) rename frame/2/ger/{ => other}/bli_ger_blk_var1.c (100%) rename frame/2/ger/{ => other}/bli_ger_blk_var2.c (100%) rename frame/2/ger/{ => other}/bli_ger_cntl.c (100%) rename frame/2/ger/{ => other}/bli_ger_cntl.h (100%) rename frame/2/ger/{ => other}/bli_ger_front.c (100%) rename frame/2/ger/{ => other}/bli_ger_front.h (100%) rename frame/2/ger/{ => other}/bli_ger_int.c (100%) rename frame/2/ger/{ => other}/bli_ger_int.h (100%) rename frame/2/hemv/{ => other}/bli_hemv_blk_var1.c (100%) rename frame/2/hemv/{ => other}/bli_hemv_blk_var2.c (100%) rename frame/2/hemv/{ => other}/bli_hemv_blk_var3.c (100%) rename frame/2/hemv/{ => other}/bli_hemv_blk_var4.c (100%) rename frame/2/hemv/{ => other}/bli_hemv_cntl.c (100%) rename frame/2/hemv/{ => other}/bli_hemv_cntl.h (100%) rename frame/2/hemv/{ => other}/bli_hemv_front.c (100%) rename 
frame/2/hemv/{ => other}/bli_hemv_front.h (100%) rename frame/2/hemv/{ => other}/bli_hemv_int.c (100%) rename frame/2/hemv/{ => other}/bli_hemv_int.h (100%) rename frame/2/her/{ => other}/bli_her_blk_var1.c (100%) rename frame/2/her/{ => other}/bli_her_blk_var2.c (100%) rename frame/2/her/{ => other}/bli_her_cntl.c (100%) rename frame/2/her/{ => other}/bli_her_cntl.h (100%) rename frame/2/her/{ => other}/bli_her_front.c (100%) rename frame/2/her/{ => other}/bli_her_front.h (100%) rename frame/2/her/{ => other}/bli_her_int.c (100%) rename frame/2/her/{ => other}/bli_her_int.h (100%) rename frame/2/her2/{ => other}/bli_her2_blk_var1.c (100%) rename frame/2/her2/{ => other}/bli_her2_blk_var2.c (100%) rename frame/2/her2/{ => other}/bli_her2_blk_var3.c (100%) rename frame/2/her2/{ => other}/bli_her2_blk_var4.c (100%) rename frame/2/her2/{ => other}/bli_her2_cntl.c (100%) rename frame/2/her2/{ => other}/bli_her2_cntl.h (100%) rename frame/2/her2/{ => other}/bli_her2_front.c (100%) rename frame/2/her2/{ => other}/bli_her2_front.h (100%) rename frame/2/her2/{ => other}/bli_her2_int.c (100%) rename frame/2/her2/{ => other}/bli_her2_int.h (100%) rename frame/2/symv/{ => other}/bli_symv_front.c (100%) rename frame/2/symv/{ => other}/bli_symv_front.h (100%) rename frame/2/syr/{ => other}/bli_syr_front.c (100%) rename frame/2/syr/{ => other}/bli_syr_front.h (100%) rename frame/2/syr2/{ => other}/bli_syr2_front.c (100%) rename frame/2/syr2/{ => other}/bli_syr2_front.h (100%) rename frame/2/trmv/{ => other}/bli_trmv_cntl.c (100%) rename frame/2/trmv/{ => other}/bli_trmv_cntl.h (100%) rename frame/2/trmv/{ => other}/bli_trmv_front.c (100%) rename frame/2/trmv/{ => other}/bli_trmv_front.h (100%) rename frame/2/trmv/{ => other}/bli_trmv_int.c (100%) rename frame/2/trmv/{ => other}/bli_trmv_int.h (100%) rename frame/2/trmv/{ => other}/bli_trmv_l_blk_var1.c (100%) rename frame/2/trmv/{ => other}/bli_trmv_l_blk_var2.c (100%) rename frame/2/trmv/{ => other}/bli_trmv_u_blk_var1.c (100%) 
rename frame/2/trmv/{ => other}/bli_trmv_u_blk_var2.c (100%) rename frame/2/trsv/{ => other}/bli_trsv_cntl.c (100%) rename frame/2/trsv/{ => other}/bli_trsv_cntl.h (100%) rename frame/2/trsv/{ => other}/bli_trsv_front.c (100%) rename frame/2/trsv/{ => other}/bli_trsv_front.h (100%) rename frame/2/trsv/{ => other}/bli_trsv_int.c (100%) rename frame/2/trsv/{ => other}/bli_trsv_int.h (100%) rename frame/2/trsv/{ => other}/bli_trsv_l_blk_var1.c (100%) rename frame/2/trsv/{ => other}/bli_trsv_l_blk_var2.c (100%) rename frame/2/trsv/{ => other}/bli_trsv_u_blk_var1.c (100%) rename frame/2/trsv/{ => other}/bli_trsv_u_blk_var2.c (100%) create mode 100644 frame/3/bli_l3_cntl.c rename frame/{cntl/bli_cntl_init.h => 3/bli_l3_cntl.h} (79%) create mode 100644 frame/3/bli_l3_packm.c rename frame/3/{gemm/ind/bli_gemm_blk_var4.h => bli_l3_packm.h} (94%) create mode 100644 frame/3/bli_l3_voft.h rename frame/3/gemm/{ind/bli_gemm_ker_var3.h => bli_gemm_packab.c} (63%) rename frame/3/gemm/ind/{bli_gemm_ker_var4.c => bli_gemm3m2_ker_var2.c} (96%) create mode 100644 frame/3/gemm/ind/bli_gemm3m3_packa.c rename frame/3/gemm/ind/{bli_gemm_ker_var3.c => bli_gemm4mb_ker_var2.c} (98%) delete mode 100644 frame/3/gemm/ind/bli_gemm_blk_var4.c delete mode 100644 frame/3/herk/bli_herk_blk_var1.c delete mode 100644 frame/3/herk/bli_herk_blk_var2.c delete mode 100644 frame/3/herk/bli_herk_blk_var3.c rename frame/{cntl/bli_cntl_init.c => 3/herk/old/bli_herk_blk_var1.c} (55%) create mode 100644 frame/3/herk/old/bli_herk_blk_var2.c create mode 100644 frame/3/herk/old/bli_herk_blk_var3.c rename frame/3/herk/{ => old}/bli_herk_int.c (91%) rename frame/3/herk/{ => old}/bli_herk_int.h (98%) delete mode 100644 frame/3/trmm/bli_trmm_blk_var1.c delete mode 100644 frame/3/trmm/bli_trmm_blk_var2.c delete mode 100644 frame/3/trmm/bli_trmm_blk_var3.c create mode 100644 frame/3/trmm/old/bli_trmm_blk_var1.c create mode 100644 frame/3/trmm/old/bli_trmm_blk_var2.c create mode 100644 
frame/3/trmm/old/bli_trmm_blk_var3.c rename frame/3/trmm/{ => old}/bli_trmm_int.c (98%) rename frame/3/trmm/{ => old}/bli_trmm_int.h (98%) rename frame/3/{gemm/ind/bli_gemm_ker_var4.h => trsm/bli_trsm_packab.c} (63%) create mode 100644 frame/3/trsm/old/bli_trsm_cntl.c rename frame/{1/packv/bli_packv_cntl.h => 3/trsm/old/bli_trsm_cntl.h} (61%) rename frame/{1m/unpackm/bli_unpackm_blk_var2.h => base/bli_auxinfo.h} (59%) create mode 100644 frame/base/bli_cntl.c create mode 100644 frame/base/bli_cntl.h create mode 100644 frame/base/bli_memsys.c create mode 100644 frame/base/bli_memsys.h create mode 100644 frame/base/old/bli_mem.c.prev delete mode 100644 frame/include/bli_mem_macro_defs.h diff --git a/frame/1/bli_l1v.h b/frame/1/bli_l1v.h index f557118f0..bd4879247 100644 --- a/frame/1/bli_l1v.h +++ b/frame/1/bli_l1v.h @@ -46,12 +46,14 @@ #include "bli_l1v_tapi.h" // Pack-related -#include "bli_packv.h" -#include "bli_unpackv.h" +// NOTE: packv and unpackv are temporarily disabled. +//#include "bli_packv.h" +//#include "bli_unpackv.h" // Other -#include "bli_scalv_cntl.h" -#include "bli_scalv_int.h" +// NOTE: scalv control tree code is temporarily disabled. 
+//#include "bli_scalv_cntl.h" +//#include "bli_scalv_int.h" // Reference kernel headers #include "bli_l1v_ref.h" diff --git a/frame/1/packv/bli_packv.c b/frame/1/other/packv/bli_packv.c similarity index 100% rename from frame/1/packv/bli_packv.c rename to frame/1/other/packv/bli_packv.c diff --git a/frame/1/packv/bli_packv.h b/frame/1/other/packv/bli_packv.h similarity index 100% rename from frame/1/packv/bli_packv.h rename to frame/1/other/packv/bli_packv.h diff --git a/frame/1/packv/bli_packv_check.c b/frame/1/other/packv/bli_packv_check.c similarity index 100% rename from frame/1/packv/bli_packv_check.c rename to frame/1/other/packv/bli_packv_check.c diff --git a/frame/1/packv/bli_packv_check.h b/frame/1/other/packv/bli_packv_check.h similarity index 100% rename from frame/1/packv/bli_packv_check.h rename to frame/1/other/packv/bli_packv_check.h diff --git a/frame/1/packv/bli_packv_cntl.c b/frame/1/other/packv/bli_packv_cntl.c similarity index 75% rename from frame/1/packv/bli_packv_cntl.c rename to frame/1/other/packv/bli_packv_cntl.c index ac068ce71..13f90a429 100644 --- a/frame/1/packv/bli_packv_cntl.c +++ b/frame/1/other/packv/bli_packv_cntl.c @@ -34,6 +34,7 @@ #include "blis.h" +#if 0 packv_t* packv_cntl = NULL; void bli_packv_cntl_init( void ) @@ -77,4 +78,41 @@ void bli_packv_cntl_obj_init( packv_t* cntl, cntl->bmid = bmid; cntl->pack_schema = pack_schema; } +#endif + +cntl_t* bli_packv_cntl_obj_create + ( + void* var_func, + void* packv_var_func, + bszid_t bmid, + pack_t pack_schema, + cntl_t* sub_node + ) +{ + cntl_t* cntl; + packv_params_t* params; + + // Allocate a packv_params_t struct. + params = bli_malloc_intl( sizeof( packv_params_t ) ); + + // Initialize the packv_params_t struct. 
+ params->size = sizeof( packv_params_t ); + params->packv_var_func = packv_var_func; + params->bmid = bmid; + params->pack_schema = pack_schema; + + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. + cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + params, + sub_node + ); + + return cntl; +} diff --git a/frame/cntl/bli_cntl.h b/frame/1/other/packv/bli_packv_cntl.h similarity index 67% rename from frame/cntl/bli_cntl.h rename to frame/1/other/packv/bli_packv_cntl.h index c53270f9b..1fc265338 100644 --- a/frame/cntl/bli_cntl.h +++ b/frame/1/other/packv/bli_packv_cntl.h @@ -32,53 +32,36 @@ */ -#include "bli_cntl_init.h" - -typedef enum +struct packv_params_s { - BLIS_UNBLOCKED = 0, - BLIS_UNB_FUSED = 1, - BLIS_UNB_OPT = 1, - BLIS_BLOCKED = 2 -} impl_t; - -typedef enum -{ - BLIS_VARIANT1 = 0, - BLIS_VARIANT2, - BLIS_VARIANT3, - BLIS_VARIANT4, - BLIS_VARIANT5, - BLIS_VARIANT6, - BLIS_VARIANT7, - BLIS_VARIANT8, - BLIS_VARIANT9, -} varnum_t; + uint64_t size + packv_voft* var_func; + bszid_t bmid; + pack_t pack_schema; +}; +typedef struct packv_params_s packv_params_t; -void bli_cntl_obj_free( void* cntl ); - - - -// -- Control tree accessor macros (common to many node types) -- - -#define bli_cntl_impl_type( cntl ) cntl->impl_type -#define bli_cntl_var_num( cntl ) cntl->var_num -#define bli_cntl_bszid( cntl ) cntl->bszid - - - -// -- Control tree query macros -- - -#define bli_cntl_is_noop( cntl ) \ +#define bli_cntl_packv_params_var_func( cntl ) \ \ - ( cntl == NULL ) + ( (packv_params_t*)( cntl->params )->var_func ) -#define bli_cntl_is_leaf( cntl ) \ +#define bli_cntl_packv_params_bmid( cntl ) \ \ - ( bli_cntl_impl_type( cntl ) != BLIS_BLOCKED ) + ( (packv_params_t*)( cntl->params )->bmid_m ) -#define bli_cntl_is_blocked( cntl ) \ +#define 
bli_cntl_packv_params_pack_schema( cntl ) \ \ - ( bli_cntl_impl_type( cntl ) == BLIS_BLOCKED ) + ( (packv_params_t*)( cntl->params )->pack_schema ) + +// ----------------------------------------------------------------------------- + +cntl_t* bli_packv_cntl_obj_create + ( + void* var_func, + void* packv_var_func, + bszid_t bmid, + pack_t pack_schema, + cntl_t* sub_node + ); diff --git a/frame/1/packv/bli_packv_init.c b/frame/1/other/packv/bli_packv_init.c similarity index 70% rename from frame/1/packv/bli_packv_init.c rename to frame/1/other/packv/bli_packv_init.c index c43931272..01b8f3cdd 100644 --- a/frame/1/packv/bli_packv_init.c +++ b/frame/1/other/packv/bli_packv_init.c @@ -52,7 +52,6 @@ void bli_packv_init pack_t pack_schema; bszid_t bmult_id; - obj_t c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -84,26 +83,6 @@ void bli_packv_init // left is whether we are to typecast vector a before packing. if ( bli_obj_datatype( *a ) != bli_obj_target_datatype( *a ) ) bli_abort(); -/* - { - // Initialize an object c for the intermediate typecast vector. - bli_packv_init_cast( a, - p, - &c ); - - // Copy/typecast vector a to vector c. - bli_copyv( a, - &c ); - } - else -*/ - { - // If no cast is needed, then aliasing object c to the original - // vector serves as a minor optimization. This causes the packv - // implementation to pack directly from vector a. - bli_obj_alias_to( *a, c ); - } - // Extract various fields from the control tree and pass them in // explicitly into _init_pack(). 
This allows external code generators @@ -116,7 +95,7 @@ void bli_packv_init ( pack_schema, bmult_id, - &c, + &a, p, cntx ); @@ -125,22 +104,24 @@ void bli_packv_init } -void bli_packv_init_pack +siz_t bli_packv_init_pack ( - pack_t pack_schema, + pack_t schema, bszid_t bmult_id, - obj_t* c, + obj_t* a, obj_t* p, cntx_t* cntx ) { - num_t dt = bli_obj_datatype( *c ); - dim_t dim_c = bli_obj_vector_dim( *c ); + num_t dt = bli_obj_datatype( *a ); + dim_t dim_a = bli_obj_vector_dim( *a ); dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); membrk_t* membrk = bli_cntx_membrk( cntx ); +#if 0 mem_t* mem_p; +#endif dim_t m_p_pad; siz_t size_p; inc_t rs_p, cs_p; @@ -148,21 +129,17 @@ void bli_packv_init_pack // We begin by copying the basic fields of c. - bli_obj_alias_to( *c, *p ); + bli_obj_alias_to( *a, *p ); // Update the dimensions. - bli_obj_set_dims( dim_c, 1, *p ); + bli_obj_set_dims( dim_a, 1, *p ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, *p ); // Set the pack schema in the p object to the value in the control tree // node. - bli_obj_set_pack_schema( pack_schema, *p ); - - // Extract the address of the mem_t object within p that will track - // properties of the packed buffer. - mem_p = bli_obj_pack_mem( *p ); + bli_obj_set_pack_schema( schema, *p ); // Compute the dimensions padded by the dimension multiples. m_p_pad = bli_align_dim_to_mult( bli_obj_vector_dim( *p ), bmult ); @@ -170,6 +147,11 @@ void bli_packv_init_pack // Compute the size of the packed buffer. size_p = m_p_pad * 1 * bli_obj_elem_size( *p ); +#if 0 + // Extract the address of the mem_t object within p that will track + // properties of the packed buffer. + mem_p = bli_obj_pack_mem( *p ); + if ( bli_mem_is_unalloc( mem_p ) ) { // If the mem_t object of p has not yet been allocated, then acquire @@ -192,19 +174,19 @@ void bli_packv_init_pack } } - // Save the padded (packed) dimensions into the packed object. 
- bli_obj_set_padded_dims( m_p_pad, 1, *p ); - // Grab the buffer address from the mem_t object and copy it to the // main object buffer field. (Sometimes this buffer address will be // copied when the value is already up-to-date, because it persists // in the main object buffer field across loop iterations.) buf = bli_mem_buffer( mem_p ); bli_obj_set_buffer( buf, *p ); +#endif + // Save the padded (packed) dimensions into the packed object. + bli_obj_set_padded_dims( m_p_pad, 1, *p ); // Set the row and column strides of p based on the pack schema. - if ( pack_schema == BLIS_PACKED_VECTOR ) + if ( schema == BLIS_PACKED_VECTOR ) { // Set the strides to reflect a column-stored vector. Note that the // column stride may never be used, and is only useful to determine @@ -215,8 +197,11 @@ void bli_packv_init_pack bli_obj_set_strides( rs_p, cs_p, *p ); } + + return size_p; } +#if 0 void bli_packv_release ( obj_t* p, @@ -226,52 +211,4 @@ void bli_packv_release if ( !bli_cntl_is_noop( cntl ) ) bli_obj_release_pack( p ); } - - -/* -void bli_packv_init_cast( obj_t* a, - obj_t* p, - obj_t* c ) -{ - // The idea here is that we want to create an object c that is identical - // to object a, except that: - // (1) the storage datatype of c is equal to the target datatype of a, - // with the element size of c adjusted accordingly, - // (2) object c is marked as being stored in a standard, contiguous - // format (ie: a column vector), - // (3) the view offset of c is reset to (0,0), and - // (4) object c's main buffer is set to a new memory region acquired - // from the memory manager, or extracted from p if a mem entry is - // already available. (After acquring a mem entry from the memory - // manager, it is cached within p for quick access later on.) - - num_t dt_targ_a = bli_obj_target_datatype( *a ); - dim_t dim_a = bli_obj_vector_dim( *a ); - siz_t elem_size_c = bli_datatype_size( dt_targ_a ); - - // We begin by copying the basic fields of a. 
- bli_obj_alias_to( *a, *c ); - - // Update datatype and element size fields. - bli_obj_set_datatype( dt_targ_a, *c ); - bli_obj_set_elem_size( elem_size_c, *c ); - - // Update the dimensions. - bli_obj_set_dims( dim_a, 1, *c ); - - // Reset the view offsets to (0,0). - bli_obj_set_offs( 0, 0, *c ); - - // Check the mem_t entry of p associated with the cast buffer. If it is - // NULL, then acquire memory sufficient to hold the object data and cache - // it to p. (Otherwise, if it is non-NULL, then memory has already been - // acquired from the memory manager and cached.) We then set the main - // buffer of c to the cached address of the cast memory. - bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); - - // Update the strides. We set the increments to reflect a column storage. - // Note that the column stride should never be used. - bli_obj_set_strides( 1, dim_a, *c ); -} -*/ - +#endif diff --git a/frame/1/packv/bli_packv_init.h b/frame/1/other/packv/bli_packv_init.h similarity index 88% rename from frame/1/packv/bli_packv_init.h rename to frame/1/other/packv/bli_packv_init.h index 03d12903c..6104bbdc7 100644 --- a/frame/1/packv/bli_packv_init.h +++ b/frame/1/other/packv/bli_packv_init.h @@ -40,23 +40,12 @@ void bli_packv_init packv_t* cntl ); -void bli_packv_init_pack +siz_t bli_packv_init_pack ( pack_t pack_schema, bszid_t bmult_id, - obj_t* c, + obj_t* a, obj_t* p, cntx_t* cntx ); -void bli_packv_release - ( - obj_t* p, - packv_t* cntl - ); - -/* -void bli_packv_init_cast( obj_t* a, - obj_t* p, - obj_t* c ); -*/ diff --git a/frame/1/packv/bli_packv_int.c b/frame/1/other/packv/bli_packv_int.c similarity index 85% rename from frame/1/packv/bli_packv_int.c rename to frame/1/other/packv/bli_packv_int.c index d22f0113e..75cbd193c 100644 --- a/frame/1/packv/bli_packv_int.c +++ b/frame/1/other/packv/bli_packv_int.c @@ -47,27 +47,23 @@ static FUNCPTR_T vars[1][3] = { bli_packv_unb_var1, NULL, NULL } }; -void bli_packv_int( obj_t* a, - obj_t* p, - cntx_t* cntx, - 
packv_t* cntl ) +void bli_packv_int + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl + ) { - // The packv operation consists of an optional typecasting pre-process. - // Here are the following possible ways packv can execute: - // 1. cast and pack: When typecasting and packing are both - // precribed, typecast a to temporary vector c and then pack - // c to p. - // 2. pack only: Typecasting is skipped when it is not needed; - // simply pack a directly to p. - // 3. cast only: Not yet supported / not used. - // 4. no-op: The control tree sometimes directs us to skip the - // pack operation entirely. Alias p to a and return. - - //obj_t c; - +#if 0 varnum_t n; impl_t i; - FUNCPTR_T f; +#endif + packv_voft f; + +// !!! +// DEFINE packv_voft type. +// !!! // Check parameters. if ( bli_error_checking_is_enabled() ) diff --git a/frame/1/packv/bli_packv_int.h b/frame/1/other/packv/bli_packv_int.h similarity index 100% rename from frame/1/packv/bli_packv_int.h rename to frame/1/other/packv/bli_packv_int.h diff --git a/frame/1/packv/bli_packv_unb_var1.c b/frame/1/other/packv/bli_packv_unb_var1.c similarity index 100% rename from frame/1/packv/bli_packv_unb_var1.c rename to frame/1/other/packv/bli_packv_unb_var1.c diff --git a/frame/1/packv/bli_packv_unb_var1.h b/frame/1/other/packv/bli_packv_unb_var1.h similarity index 100% rename from frame/1/packv/bli_packv_unb_var1.h rename to frame/1/other/packv/bli_packv_unb_var1.h diff --git a/frame/1/scalv/bli_scalv_cntl.c b/frame/1/other/scalv/bli_scalv_cntl.c similarity index 100% rename from frame/1/scalv/bli_scalv_cntl.c rename to frame/1/other/scalv/bli_scalv_cntl.c diff --git a/frame/1/scalv/bli_scalv_cntl.h b/frame/1/other/scalv/bli_scalv_cntl.h similarity index 100% rename from frame/1/scalv/bli_scalv_cntl.h rename to frame/1/other/scalv/bli_scalv_cntl.h diff --git a/frame/1/scalv/bli_scalv_int.c b/frame/1/other/scalv/bli_scalv_int.c similarity index 100% rename from frame/1/scalv/bli_scalv_int.c rename to 
frame/1/other/scalv/bli_scalv_int.c diff --git a/frame/1/scalv/bli_scalv_int.h b/frame/1/other/scalv/bli_scalv_int.h similarity index 100% rename from frame/1/scalv/bli_scalv_int.h rename to frame/1/other/scalv/bli_scalv_int.h diff --git a/frame/1/unpackv/bli_unpackv.c b/frame/1/other/unpackv/bli_unpackv.c similarity index 100% rename from frame/1/unpackv/bli_unpackv.c rename to frame/1/other/unpackv/bli_unpackv.c diff --git a/frame/1/unpackv/bli_unpackv.h b/frame/1/other/unpackv/bli_unpackv.h similarity index 100% rename from frame/1/unpackv/bli_unpackv.h rename to frame/1/other/unpackv/bli_unpackv.h diff --git a/frame/1/unpackv/bli_unpackv_check.c b/frame/1/other/unpackv/bli_unpackv_check.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_check.c rename to frame/1/other/unpackv/bli_unpackv_check.c diff --git a/frame/1/unpackv/bli_unpackv_check.h b/frame/1/other/unpackv/bli_unpackv_check.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_check.h rename to frame/1/other/unpackv/bli_unpackv_check.h diff --git a/frame/1/unpackv/bli_unpackv_cntl.c b/frame/1/other/unpackv/bli_unpackv_cntl.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_cntl.c rename to frame/1/other/unpackv/bli_unpackv_cntl.c diff --git a/frame/1/unpackv/bli_unpackv_cntl.h b/frame/1/other/unpackv/bli_unpackv_cntl.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_cntl.h rename to frame/1/other/unpackv/bli_unpackv_cntl.h diff --git a/frame/1/unpackv/bli_unpackv_int.c b/frame/1/other/unpackv/bli_unpackv_int.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_int.c rename to frame/1/other/unpackv/bli_unpackv_int.c diff --git a/frame/1/unpackv/bli_unpackv_int.h b/frame/1/other/unpackv/bli_unpackv_int.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_int.h rename to frame/1/other/unpackv/bli_unpackv_int.h diff --git a/frame/1/unpackv/bli_unpackv_unb_var1.c b/frame/1/other/unpackv/bli_unpackv_unb_var1.c similarity index 100% 
rename from frame/1/unpackv/bli_unpackv_unb_var1.c rename to frame/1/other/unpackv/bli_unpackv_unb_var1.c diff --git a/frame/1/unpackv/bli_unpackv_unb_var1.h b/frame/1/other/unpackv/bli_unpackv_unb_var1.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_unb_var1.h rename to frame/1/other/unpackv/bli_unpackv_unb_var1.h diff --git a/frame/1m/bli_l1m.h b/frame/1m/bli_l1m.h index ff9c98459..5c55b97d3 100644 --- a/frame/1m/bli_l1m.h +++ b/frame/1m/bli_l1m.h @@ -36,6 +36,7 @@ #include "bli_l1m_check.h" #include "bli_l1m_ft.h" +#include "bli_l1m_voft.h" // Prototype object APIs with and without contexts. #include "bli_oapi_w_cntx.h" @@ -51,6 +52,5 @@ #include "bli_unpackm.h" // Other -#include "bli_scalm_cntl.h" -#include "bli_scalm_int.h" +#include "bli_scalm.h" diff --git a/frame/1m/bli_l1m_voft.h b/frame/1m/bli_l1m_voft.h new file mode 100644 index 000000000..f5fdf5b65 --- /dev/null +++ b/frame/1m/bli_l1m_voft.h @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L1M_VAR_OFT_H +#define BLIS_L1M_VAR_OFT_H + + +// +// -- Level-3 variant function types ------------------------------------------- +// + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* p, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( packm ) + + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* p, \ + obj_t* a, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( unpackm ) + + + +#endif + diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index cc8e84b2d..4ce7b1504 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -93,10 +93,14 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = }; -void bli_packm_blk_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* t ) +void bli_packm_blk_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* t + ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -140,7 +144,7 @@ void bli_packm_blk_var1( obj_t* c, // whether we are executing an induced method. 
if ( bli_is_nat_packed( schema ) ) { - // This branch if for native execution, where we assume that + // This branch is for native execution, where we assume that // the micro-kernel will always apply the alpha scalar of the // higher-level operation. Thus, we use BLIS_ONE for kappa so // that the underlying packm implementation does not perform @@ -156,28 +160,25 @@ void bli_packm_blk_var1( obj_t* c, // real domain micro-kernels. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) - if( bli_thread_am_ochief( t ) ) + if ( bli_obj_scalar_has_nonzero_imag( p ) ) { - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { -//printf( "applying non-zero imag kappa\n" ); - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_p = κ - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } + //printf( "applying non-zero imag kappa\n" ); + + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); + + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); + + kappa_p = κ + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; } - kappa_p = bli_thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. 
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); @@ -194,7 +195,12 @@ void bli_packm_blk_var1( obj_t* c, bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; else packm_kers = packm_struc_cxk_kers; #else - func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); + // The original idea here was to read the packm_ukr from the context + // if it is non-NULL. The problem is, it requires that we be able to + // assume that the packm_ukr field is initialized to NULL, which it + // currently is not. + + //func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); //if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) ) { @@ -203,7 +209,6 @@ void bli_packm_blk_var1( obj_t* c, // we use the default lookup table to determine the right func_t // for the current schema. const dim_t i = bli_pack_schema_index( schema ); -//printf( "bli_packm_blk_var1: pack schema index = %lu (schema = %x)\n", i, schema ); packm_kers = &packm_struc_cxk_kers[ i ]; } @@ -221,11 +226,6 @@ void bli_packm_blk_var1( obj_t* c, // Query the datatype-specific function pointer from the func_t object. packm_ker = bli_func_get_dt( dt_cp, packm_kers ); - -//bli_cntx_print( cntx ); -//printf( "bli_packm_blk_var1: packm_ker = %p\n", packm_ker ); -//printf( "bli_packm_blk_var1: cntx_packm_ker = %p\n", cntx_packm_kers ); -//printf( "bli_packm_blk_var1: local_table_entry = %p\n", &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ] ); // Index into the type combination array to extract the correct // function pointer. 
f = ftypes[dt_cp]; @@ -598,6 +598,57 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ p_inc = ps_p; \ } \ \ +/* +if ( col_stored ) { \ + if ( bli_thread_work_id( thread ) == 0 ) \ + { \ + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ + if ( bli_thread_work_id( thread ) == 1 ) \ + { \ + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ +} \ +else { \ + if ( bli_thread_work_id( thread ) == 0 ) \ + { \ + printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ + if ( bli_thread_work_id( thread ) == 1 ) \ + { \ + printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + 
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ +} \ +*/ \ +\ /* if ( bli_is_4mi_packed( schema ) ) { \ printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \ diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 8971da5c0..4e04f86f9 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -32,10 +32,14 @@ */ -void bli_packm_blk_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* t ); +void bli_packm_blk_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* t + ); #undef GENTPROT diff --git a/frame/1m/packm/bli_packm_check.c b/frame/1m/packm/bli_packm_check.c index 6a56b8676..f8c66eee5 100644 --- a/frame/1m/packm/bli_packm_check.c +++ b/frame/1m/packm/bli_packm_check.c @@ -35,9 +35,12 @@ #include "blis.h" -void bli_packm_init_check( obj_t* a, - obj_t* p, - cntx_t* cntx ) +void bli_packm_init_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ) { err_t e_val; @@ -54,9 +57,12 @@ void bli_packm_init_check( obj_t* a, //bli_check_error_code( e_val ); } -void bli_packm_int_check( obj_t* a, - obj_t* p, - cntx_t* cntx ) +void bli_packm_int_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ) { err_t e_val; diff --git a/frame/1m/packm/bli_packm_check.h b/frame/1m/packm/bli_packm_check.h index 9974ced6b..9b2e8a66e 100644 --- a/frame/1m/packm/bli_packm_check.h +++ b/frame/1m/packm/bli_packm_check.h @@ -32,10 +32,17 @@ */ -void bli_packm_init_check( obj_t* a, - obj_t* p, - cntx_t* cntx ); +void bli_packm_init_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ); + +void bli_packm_int_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ); -void bli_packm_int_check( obj_t* a, - obj_t* p, - cntx_t* cntx ); diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index f0f674615..67b01fffb 100644 --- 
a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -34,109 +34,49 @@ #include "blis.h" -packm_t* packm_cntl_row = NULL; -packm_t* packm_cntl_col = NULL; - -packm_t* packm_cntl = NULL; - -void bli_packm_cntl_init() +cntl_t* bli_packm_cntl_obj_create + ( + void* var_func, + void* packm_var_func, + bszid_t bmid_m, + bszid_t bmid_n, + bool_t does_invert_diag, + bool_t rev_iter_if_upper, + bool_t rev_iter_if_lower, + pack_t pack_schema, + packbuf_t pack_buf_type, + cntl_t* sub_node + ) { - // Generally speaking, the BLIS_PACKED_ROWS and BLIS_PACKED_COLUMNS - // are used by the level-2 operations. These schemas amount to simple - // copies to row or column storage. These simple schemas may be used - // by level-3 operations, but they should never be used for matrices - // with structure (since they do not densify). - // The BLIS_PACKED_ROW_PANELS and BLIS_PACKED_COL_PANELS schemas are - // used only in level-3 operations. They pack to (typically) skinny - // row and column panels, where the width of the panel is determined - // by register blocksizes. It is assumed that matrices with structure - // will be densified. + cntl_t* cntl; + packm_params_t* params; - // Create control trees to pack by rows. - packm_cntl_row - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to rows: - BLIS_VF, // used for m dimension - BLIS_VF, // used for n dimension - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_ROWS, - BLIS_BUFFER_FOR_GEN_USE ); + // Allocate a packm_params_t struct. + params = bli_malloc_intl( sizeof( packm_params_t ) ); + // Initialize the packm_params_t struct. 
+ params->size = sizeof( packm_params_t ); + params->var_func = packm_var_func; + params->bmid_m = bmid_m; + params->bmid_n = bmid_n; + params->does_invert_diag = does_invert_diag; + params->rev_iter_if_upper = rev_iter_if_upper; + params->rev_iter_if_lower = rev_iter_if_lower; + params->pack_schema = pack_schema; + params->pack_buf_type = pack_buf_type; - // Create control trees to pack by columns. - packm_cntl_col - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to columns: - BLIS_VF, // used for m dimension - BLIS_VF, // used for n dimension - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_COLUMNS, - BLIS_BUFFER_FOR_GEN_USE ); - - - // Set defaults when we don't care whether the packing is by rows or - // by columns. - packm_cntl = packm_cntl_col; -} - -void bli_packm_cntl_finalize() -{ - bli_cntl_obj_free( packm_cntl_row ); - bli_cntl_obj_free( packm_cntl_col ); -} - -packm_t* bli_packm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ) -{ - packm_t* cntl; - - cntl = ( packm_t* ) bli_malloc_intl( sizeof(packm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bmid_m = bmid_m; - cntl->bmid_n = bmid_n; - cntl->does_invert_diag = does_invert_diag; - cntl->rev_iter_if_upper = rev_iter_if_upper; - cntl->rev_iter_if_lower = rev_iter_if_lower; - cntl->pack_schema = pack_schema; - cntl->pack_buf_type = pack_buf_type; + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. 
+ cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + params, + sub_node + ); return cntl; } -void bli_packm_cntl_obj_init( packm_t* cntl, - impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ) -{ - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bmid_m = bmid_m; - cntl->bmid_n = bmid_n; - cntl->does_invert_diag = does_invert_diag; - cntl->rev_iter_if_upper = rev_iter_if_upper; - cntl->rev_iter_if_lower = rev_iter_if_lower; - cntl->pack_schema = pack_schema; - cntl->pack_buf_type = pack_buf_type; -} - diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 1dc31c543..057a512ed 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -32,56 +32,65 @@ */ -struct packm_s +struct packm_params_s { - impl_t impl_type; - varnum_t var_num; - bszid_t bmid_m; - bszid_t bmid_n; - bool_t does_invert_diag; - bool_t rev_iter_if_upper; - bool_t rev_iter_if_lower; - pack_t pack_schema; - packbuf_t pack_buf_type; + uint64_t size; // size field must be present and come first. 
+ packm_voft var_func; + bszid_t bmid_m; + bszid_t bmid_n; + bool_t does_invert_diag; + bool_t rev_iter_if_upper; + bool_t rev_iter_if_lower; + pack_t pack_schema; + packbuf_t pack_buf_type; }; -typedef struct packm_s packm_t; +typedef struct packm_params_s packm_params_t; -#define cntl_bmid_m( cntl ) cntl->bmid_m -#define cntl_bmid_n( cntl ) cntl->bmid_n +#define bli_cntl_packm_params_var_func( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->var_func ) -#define cntl_does_invert_diag( cntl ) cntl->does_invert_diag -#define cntl_rev_iter_if_upper( cntl ) cntl->rev_iter_if_upper -#define cntl_rev_iter_if_lower( cntl ) cntl->rev_iter_if_lower -#define cntl_pack_schema( cntl ) cntl->pack_schema -#define cntl_pack_buf_type( cntl ) cntl->pack_buf_type +#define bli_cntl_packm_params_bmid_m( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->bmid_m ) -#define bli_cntl_sub_packm( cntl ) cntl->sub_packm -#define bli_cntl_sub_packm_a( cntl ) cntl->sub_packm_a -#define bli_cntl_sub_packm_a11( cntl ) cntl->sub_packm_a11 -#define bli_cntl_sub_packm_b( cntl ) cntl->sub_packm_b -#define bli_cntl_sub_packm_b11( cntl ) cntl->sub_packm_b11 -#define bli_cntl_sub_packm_c( cntl ) cntl->sub_packm_c -#define bli_cntl_sub_packm_c11( cntl ) cntl->sub_packm_c11 +#define bli_cntl_packm_params_bmid_n( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->bmid_n ) -void bli_packm_cntl_init( void ); -void bli_packm_cntl_finalize( void ); -packm_t* bli_packm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ); -void bli_packm_cntl_obj_init( packm_t* cntl, - impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ); +#define bli_cntl_packm_params_does_invert_diag( cntl ) \ +\ 
+ ( ( (packm_params_t*)(cntl)->params )->does_invert_diag ) + +#define bli_cntl_packm_params_rev_iter_if_upper( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->rev_iter_if_upper ) + +#define bli_cntl_packm_params_rev_iter_if_lower( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->rev_iter_if_lower ) + +#define bli_cntl_packm_params_pack_schema( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->pack_schema ) + +#define bli_cntl_packm_params_pack_buf_type( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->pack_buf_type ) + +// ----------------------------------------------------------------------------- + +cntl_t* bli_packm_cntl_obj_create + ( + void* var_func, + void* packm_var_func, + bszid_t bmid_m, + bszid_t bmid_n, + bool_t does_invert_diag, + bool_t rev_iter_if_upper, + bool_t rev_iter_if_lower, + pack_t pack_schema, + packbuf_t pack_buf_type, + cntl_t* sub_node + ); diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index d42abfd62..4f570400a 100644 --- a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -52,7 +52,7 @@ void bli_packm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx ); // Initialize the context with the global membrk object. - bli_cntx_set_membrk( bli_mem_global_membrk(), cntx ); + bli_cntx_set_membrk( bli_memsys_global_membrk(), cntx ); } void bli_packm_cntx_finalize( cntx_t* cntx ) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index c33a0410e..ccf88f3cb 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -35,38 +35,43 @@ #include "blis.h" -void bli_packm_init( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl ) +siz_t bli_packm_init + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl + ) { // The purpose of packm_init() is to initialize an object P so that // a source object A can be packed into P via one of the packm - // implementations. 
This initialization includes acquiring a suitable - // block of memory from the memory allocator, if such a block of memory - // has not already been allocated previously. + // implementations. This initialization precedes the acquisition of a + // suitable block of memory from the memory allocator (if such a block + // of memory has not already been allocated previously). - invdiag_t invert_diag; - pack_t schema; - packord_t pack_ord_if_up; - packord_t pack_ord_if_lo; - packbuf_t pack_buf_type; bszid_t bmult_id_m; bszid_t bmult_id_n; - obj_t c; + bool_t does_invert_diag; + bool_t rev_iter_if_upper; + bool_t rev_iter_if_lower; + //pack_t pack_schema; + packbuf_t pack_buf_type; + siz_t size_needed; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_packm_init_check( a, p, cntx ); - // First check if we are to skip this operation because the control tree - // is NULL, and if so, simply alias the object to its packed counterpart. - if ( bli_cntl_is_noop( cntl ) ) - { - bli_obj_alias_to( *a, *p ); - return; - } + // Extract various fields from the control tree. + bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); + rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); + rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); + //pack_schema = bli_cntl_packm_params_pack_schema( cntl ); + pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); +#if 0 // Let us now check to see if the object has already been packed. First // we check if it has been packed to an unspecified (row or column) // format, in which case we can alias the object and return. @@ -79,179 +84,150 @@ void bli_packm_init( obj_t* a, if ( bli_obj_pack_schema( *a ) == BLIS_PACKED_UNSPEC ) { bli_obj_alias_to( *a, *p ); - return; + return 0; } - // At this point, we can be assured that cntl is not NULL. 
Now we check - // if the object has already been packed to the desired schema (as en- - // coded in the control tree). If so, we can alias and return, as above. + // Now we check if the object has already been packed to the desired + // schema (as encoded in the control tree). If so, we can alias and + // return 0. // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED // and thus packing will be called for (but in some cases packing has // already taken place, or does not need to take place, and so that will // be indicated by the pack status). Also, not all combinations of // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( *a ) == cntl_pack_schema( cntl ) ) + if ( bli_obj_pack_schema( *a ) == pack_schema ) { bli_obj_alias_to( *a, *p ); - return; + return 0; } +#endif // If the object is marked as being filled with zeros, then we can skip - // the packm operation entirely and alias. Notice that we use pack-aware - // aliasing. This is needed because the object may have been packed in - // a previous iteration, which means the object currently contains the - // mem_t entry of an already-allocated block. bli_obj_alias_for_packing() - // will avoid overwriting that mem_t entry, which means it can be - // properly released later on. + // the packm operation entirely and alias. if ( bli_obj_is_zeros( *a ) ) { - bli_obj_alias_for_packing( *a, *p ); - return; + bli_obj_alias_to( *a, *p ); + return 0; } - // Now, if we are not skipping the pack operation, then the only question - // left is whether we are to typecast matrix a before packing. - if ( bli_obj_datatype( *a ) != bli_obj_target_datatype( *a ) ) - bli_abort(); -/* - { - // Initialize an object c for the intermediate typecast matrix. - bli_packm_init_cast( a, - p, - &c ); - - // Copy/typecast matrix a to matrix c. 
- bli_copym( a, - &c ); - } - else -*/ - { - // If no cast is needed, then aliasing object c to the original - // matrix serves as a minor optimization. This causes the packm - // implementation to pack directly from matrix a. - bli_obj_alias_to( *a, c ); - } - - - // Extract various fields from the control tree. - pack_buf_type = cntl_pack_buf_type( cntl ); - bmult_id_m = cntl_bmid_m( cntl ); - bmult_id_n = cntl_bmid_n( cntl ); - - // Extract the schema from the context, depending on whether we are + // We now ignore the pack_schema field in the control tree and + // extract the schema from the context, depending on whether we are // preparing to pack a block of A or panel of B. For A and B, we must // obtain the schema from the context since the induced methods reuse // the same control trees used by native execution, and those induced // methods specify the schema used by the current execution phase // within the context (whereas the control tree does not change). + pack_t schema; + if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) { schema = bli_cntx_get_pack_schema_a( cntx ); -//printf( "bli_packm_init: pack schema a = %x\n", schema ); } else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) { schema = bli_cntx_get_pack_schema_b( cntx ); -//printf( "bli_packm_init: pack schema b = %x\n", schema ); } else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) { // If we get a request to pack C for some reason, it is likely // not part of an induced method, and so it would be safe (and // necessary) to read the pack schema from the control tree. - schema = cntl_pack_schema( cntl ); -//printf( "bli_packm_init: pack schema c = %x\n", schema ); + schema = bli_cntl_packm_params_pack_schema( cntl ); } // Prepare a few other variables based on properties of the control // tree. 
- if ( cntl_does_invert_diag( cntl ) ) invert_diag = BLIS_INVERT_DIAG; - else invert_diag = BLIS_NO_INVERT_DIAG; + invdiag_t invert_diag; + packord_t pack_ord_if_up; + packord_t pack_ord_if_lo; - if ( cntl_rev_iter_if_upper( cntl ) ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER; - else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER; + if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG; + else invert_diag = BLIS_NO_INVERT_DIAG; - if ( cntl_rev_iter_if_lower( cntl ) ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; - else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; + if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER; + else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER; + + if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; + else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; // Initialize object p for the final packed matrix. - bli_packm_init_pack( invert_diag, - schema, - pack_ord_if_up, - pack_ord_if_lo, - pack_buf_type, - bmult_id_m, - bmult_id_n, - &c, - p, - cntx ); + size_needed + = + bli_packm_init_pack + ( + invert_diag, + schema, + pack_ord_if_up, + pack_ord_if_lo, + bmult_id_m, + bmult_id_n, + a, + p, + cntx + ); - // Now p is ready to be packed. + // Return the size needed for memory allocation of the packed buffer. 
+ return size_needed; } -void bli_packm_init_pack( invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - packbuf_t pack_buf_type, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* c, - obj_t* p, - cntx_t* cntx ) +siz_t bli_packm_init_pack + ( + invdiag_t invert_diag, + pack_t schema, + packord_t pack_ord_if_up, + packord_t pack_ord_if_lo, + bszid_t bmult_id_m, + bszid_t bmult_id_n, + obj_t* a, + obj_t* p, + cntx_t* cntx + ) { - num_t dt = bli_obj_datatype( *c ); - trans_t transc = bli_obj_onlytrans_status( *c ); - dim_t m_c = bli_obj_length( *c ); - dim_t n_c = bli_obj_width( *c ); + num_t dt = bli_obj_datatype( *a ); + trans_t transa = bli_obj_onlytrans_status( *a ); + dim_t m_a = bli_obj_length( *a ); + dim_t n_a = bli_obj_width( *a ); dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ); dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx ); - membrk_t* membrk = bli_cntx_get_membrk( cntx ); - - mem_t* mem_p; dim_t m_p, n_p; dim_t m_p_pad, n_p_pad; siz_t size_p; siz_t elem_size_p; inc_t rs_p, cs_p; inc_t is_p; - void* buf; - // We begin by copying the basic fields of c. We do NOT copy the - // pack_mem entry from c because the entry in p may be cached from - // a previous iteration, and thus we don't want to overwrite it. - bli_obj_alias_for_packing( *c, *p ); + // We begin by copying the fields of A. + bli_obj_alias_to( *a, *p ); // Update the dimension fields to explicitly reflect a transposition, // if needed. // Then, clear the conjugation and transposition fields from the object // since matrix packing in BLIS is deemed to take care of all conjugation // and transposition necessary. - // Then, we adjust the properties of p when c needs a transposition. 
- // We negate the diagonal offset, and if c is upper- or lower-stored, - // we either toggle the uplo of p. - // Finally, if we mark p as dense since we assume that all matrices, + // Then, we adjust the properties of P when A needs a transposition. + // We negate the diagonal offset, and if A is upper- or lower-stored, + // we either toggle the uplo of P. + // Finally, if we mark P as dense since we assume that all matrices, // regardless of structure, will be densified. - bli_obj_set_dims_with_trans( transc, m_c, n_c, *p ); + bli_obj_set_dims_with_trans( transa, m_a, n_a, *p ); bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, *p ); - if ( bli_does_trans( transc ) ) + if ( bli_does_trans( transa ) ) { bli_obj_negate_diag_offset( *p ); - if ( bli_obj_is_upper_or_lower( *c ) ) + if ( bli_obj_is_upper_or_lower( *a ) ) bli_obj_toggle_uplo( *p ); } - // If we are packing micro-panels, mark p as dense. Otherwise, we are + // If we are packing micro-panels, mark P as dense. Otherwise, we are // probably being called in the context of a level-2 operation, in - // which case we do not want to overwrite the uplo field of p (inherited - // from c) with BLIS_DENSE because that information may be needed by + // which case we do not want to overwrite the uplo field of P (inherited + // from A) with BLIS_DENSE because that information may be needed by // the level-2 operation's unblocked variant to decide whether to // execute a "lower" or "upper" branch of code. if ( bli_is_panel_packed( schema ) ) @@ -265,7 +241,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, // Set the invert diagonal field. bli_obj_set_invert_diag( invert_diag, *p ); - // Set the pack status of p to the pack schema prescribed in the control + // Set the pack status of P to the pack schema prescribed in the control // tree node. 
bli_obj_set_pack_schema( schema, *p ); @@ -273,15 +249,11 @@ void bli_packm_init_pack( invdiag_t invert_diag, bli_obj_set_pack_order_if_upper( pack_ord_if_up, *p ); bli_obj_set_pack_order_if_lower( pack_ord_if_lo, *p ); - // Extract the address of the mem_t object within p that will track - // properties of the packed buffer. - mem_p = bli_obj_pack_mem( *p ); - // Compute the dimensions padded by the dimension multiples. These // dimensions will be the dimensions of the packed matrices, including // zero-padding, and will be used by the macro- and micro-kernels. - // We compute them by starting with the effective dimensions of c (now - // in p) and aligning them to the dimension multiples (typically equal + // We compute them by starting with the effective dimensions of A (now + // in P) and aligning them to the dimension multiples (typically equal // to register blocksizes). This does waste a little bit of space for // level-2 operations, but that's okay with us. m_p = bli_obj_length( *p ); @@ -295,9 +267,9 @@ void bli_packm_init_pack( invdiag_t invert_diag, bli_obj_set_padded_dims( m_p_pad, n_p_pad, *p ); // Now we prepare to compute strides, align them, and compute the - // total number of bytes needed for the packed buffer. After that, - // we will acquire an appropriate block of memory from the memory - // allocator. + // total number of bytes needed for the packed buffer. The caller + // will then use that value to acquire an appropriate block of memory + // from the memory allocator. // Extract the element size for the packed object. elem_size_p = bli_obj_elem_size( *p ); @@ -320,7 +292,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, rs_p = bli_align_dim_to_size( rs_p, elem_size_p, BLIS_HEAP_STRIDE_ALIGN_SIZE ); - // Store the strides in p. + // Store the strides in P. bli_obj_set_strides( rs_p, cs_p, *p ); // Compute the size of the packed buffer. 
@@ -343,7 +315,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, cs_p = bli_align_dim_to_size( cs_p, elem_size_p, BLIS_HEAP_STRIDE_ALIGN_SIZE ); - // Store the strides in p. + // Store the strides in P. bli_obj_set_strides( rs_p, cs_p, *p ); // Compute the size of the packed buffer. @@ -431,7 +403,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( m_p_pad / m_panel ); else is_p = 1; - // Store the strides and panel dimension in p. + // Store the strides and panel dimension in P. bli_obj_set_strides( rs_p, cs_p, *p ); bli_obj_set_imag_stride( is_p, *p ); bli_obj_set_panel_dim( m_panel, *p ); @@ -524,7 +496,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( n_p_pad / n_panel ); else is_p = 1; - // Store the strides and panel dimension in p. + // Store the strides and panel dimension in P. bli_obj_set_strides( rs_p, cs_p, *p ); bli_obj_set_imag_stride( is_p, *p ); bli_obj_set_panel_dim( n_panel, *p ); @@ -547,99 +519,6 @@ void bli_packm_init_pack( invdiag_t invert_diag, size_p = 0; } - - if ( bli_mem_is_unalloc( mem_p ) ) - { - // If the mem_t object of p has not yet been allocated, then acquire - // a memory block of type pack_buf_type. - bli_membrk_acquire_m( membrk, - size_p, - pack_buf_type, - mem_p ); - } - else - { - // If the mem_t object is currently allocated and smaller than is - // needed, then it must have been allocated for a different type - // of object (a different pack_buf_type value), so we must first - // release it and then re-acquire it using the new size and new - // pack_buf_type value. - if ( bli_mem_size( mem_p ) < size_p ) - { - bli_membrk_release( mem_p ); - bli_membrk_acquire_m( membrk, - size_p, - pack_buf_type, - mem_p ); - } - } - - // Grab the buffer address from the mem_t object and copy it to the - // main object buffer field. 
(Sometimes this buffer address will be - // copied when the value is already up-to-date, because it persists - // in the main object buffer field across loop iterations.) - buf = bli_mem_buffer( mem_p ); - bli_obj_set_buffer( buf, *p ); - + return size_p; } -void bli_packm_release( obj_t* p, - packm_t* cntl ) -{ - if ( !bli_cntl_is_noop( cntl ) ) - bli_obj_release_pack( p ); -} - - -/* -void bli_packm_init_cast( obj_t* a, - obj_t* p, - obj_t* c ) -{ - // The idea here is that we want to create an object c that is identical - // to object a, except that: - // (1) the storage datatype of c is equal to the target datatype of a, - // with the element size of c adjusted accordingly, - // (2) the view offset of c is reset to (0,0), - // (3) object c's main buffer is set to a new memory region acquired - // from the memory manager, or extracted from p if a mem entry is - // already available, (After acquring a mem entry from the memory - // manager, it is cached within p for quick access later on.) - // (4) object c is marked as being stored in a standard, contiguous - // format (ie: a column-major order). - // Any transposition encoded within object a will not be handled here, - // but rather will be handled in the packm implementation. That way, - // the only thing castm needs to do is cast. - - num_t dt_targ_a = bli_obj_target_datatype( *a ); - dim_t m_a = bli_obj_length( *a ); - siz_t elem_size_c = bli_datatype_size( dt_targ_a ); - inc_t rs_c, cs_c; - - // We begin by copying the basic fields of a. - bli_obj_alias_to( *a, *c ); - - // Update datatype and element size fields. - bli_obj_set_datatype( dt_targ_a, *c ); - bli_obj_set_elem_size( elem_size_c, *c ); - - // Reset the view offsets to (0,0). - bli_obj_set_offs( 0, 0, *c ); - - // Check the mem_t entry of p associated with the cast buffer. If it is - // NULL, then acquire memory sufficient to hold the object data and cache - // it to p. 
(Otherwise, if it is non-NULL, then memory has already been - // acquired from the memory manager and cached.) We then set the main - // buffer of c to the cached address of the cast memory. - bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); - - // Update the strides. We set the increments to reflect column-major order - // storage. We start the leading dimension out as m(a) and increment it if - // necessary so that the beginning of each column is aligned. - cs_c = bli_align_dim_to_size( m_a, elem_size_c, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - rs_c = 1; - bli_obj_set_strides( rs_c, cs_c, *c ); -} -*/ - diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index a21956ba2..fe0de52fc 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -32,28 +32,24 @@ */ -void bli_packm_init( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl ); +siz_t bli_packm_init + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl + ); -void bli_packm_init_pack( invdiag_t invert_diag, - pack_t pack_schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - packbuf_t pack_buf_type, - bszid_t mr_id, - bszid_t nr_id, - obj_t* c, - obj_t* p, - cntx_t* cntx ); - -/* -void bli_packm_init_cast( obj_t* a, - obj_t* p, - obj_t* c ); -*/ - -void bli_packm_release( obj_t* p, - packm_t* cntl ); +siz_t bli_packm_init_pack + ( + invdiag_t invert_diag, + pack_t schema, + packord_t pack_ord_if_up, + packord_t pack_ord_if_lo, + bszid_t bmult_id_m, + bszid_t bmult_id_n, + obj_t* a, + obj_t* p, + cntx_t* cntx + ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 7d55c2a64..d36919c33 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -34,33 +34,16 @@ #include "blis.h" -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* p, - cntx_t* cntx, - thrinfo_t* t ); - -static FUNCPTR_T vars[6][3] = +void bli_packm_int + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + 
cntl_t* cntl, + thrinfo_t* thread + ) { - // unblocked optimized unblocked blocked - { bli_packm_unb_var1, NULL, bli_packm_blk_var1 }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, -}; - -void bli_packm_int( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl, - thrinfo_t* thread ) -{ - varnum_t n; - impl_t i; - FUNCPTR_T f; + packm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -70,14 +53,6 @@ void bli_packm_int( obj_t* a, // it, then we should fold it into the next alias-and-early-exit block. //if ( bli_obj_has_zero_dim( *a ) ) bli_abort(); - // First check if we are to skip this operation because the control tree - // is NULL. We return without taking any action because a was already - // aliased to p in packm_init(). - if ( bli_cntl_is_noop( cntl ) ) - { - return; - } - // Let us now check to see if the object has already been packed. First // we check if it has been packed to an unspecified (row or column) // format, in which case we can return, since by now aliasing has already @@ -101,7 +76,7 @@ void bli_packm_int( obj_t* a, // already taken place, or does not need to take place, and so that will // be indicated by the pack status). Also, not all combinations of // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( *a ) == cntl_pack_schema( cntl ) ) + if ( bli_obj_pack_schema( *a ) == bli_cntl_packm_params_pack_schema( cntl ) ) { return; } @@ -113,21 +88,20 @@ void bli_packm_int( obj_t* a, return; } - - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. - f = vars[n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_packm_params_var_func( cntl ); // Invoke the variant with kappa_use. 
- f( a, - p, - cntx, - thread ); + f + ( + a, + p, + cntx, + cntl, + thread + ); - // Barrier so that packing is done before computation - bli_thread_obarrier( thread ); + // Barrier so that packing is done before computation. + bli_thread_obarrier( thread ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 89bd4f0d5..14d006d28 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -32,9 +32,11 @@ */ -void bli_packm_int( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl, - thrinfo_t* thread ); - +void bli_packm_int + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c index 47f0dc362..1c1265661 100644 --- a/frame/1m/packm/bli_packm_thrinfo.c +++ b/frame/1m/packm/bli_packm_thrinfo.c @@ -41,7 +41,8 @@ thrinfo_t* bli_packm_thrinfo_create thrcomm_t* icomm, dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ) { thrinfo_t* thread = bli_malloc_intl( sizeof( thrinfo_t ) ); @@ -53,9 +54,8 @@ thrinfo_t* bli_packm_thrinfo_create icomm, icomm_id, n_way, work_id, - NULL, - NULL, - NULL + FALSE, + sub_node ); return thread; @@ -69,7 +69,8 @@ void bli_packm_thrinfo_init thrcomm_t* icomm, dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ) { bli_thrinfo_init @@ -78,9 +79,8 @@ void bli_packm_thrinfo_init ocomm, ocomm_id, icomm, icomm_id, n_way, work_id, - NULL, - NULL, - NULL + FALSE, + sub_node ); } @@ -95,7 +95,8 @@ void bli_packm_thrinfo_init_single &BLIS_SINGLE_COMM, 0, &BLIS_SINGLE_COMM, 0, 1, - 0 + 0, + NULL ); } diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h index 45ab46c3c..7b6d7ae4d 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/1m/packm/bli_packm_thrinfo.h @@ -49,7 +49,8 @@ thrinfo_t* bli_packm_thrinfo_create thrcomm_t* icomm, dim_t icomm_id, dim_t n_way, - dim_t work_id + 
dim_t work_id, + thrinfo_t* sub_node ); void bli_packm_thrinfo_init @@ -60,7 +61,8 @@ void bli_packm_thrinfo_init thrcomm_t* icomm, dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single diff --git a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c index 75e999320..49b3a918a 100644 --- a/frame/1m/packm/bli_packm_unb_var1.c +++ b/frame/1m/packm/bli_packm_unb_var1.c @@ -55,10 +55,14 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1); -void bli_packm_unb_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* thread ) +void bli_packm_unb_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_cp = bli_obj_datatype( *c ); diff --git a/frame/1m/packm/bli_packm_unb_var1.h b/frame/1m/packm/bli_packm_unb_var1.h index 3d737d483..cefd4de94 100644 --- a/frame/1m/packm/bli_packm_unb_var1.h +++ b/frame/1m/packm/bli_packm_unb_var1.h @@ -32,10 +32,14 @@ */ -void bli_packm_unb_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* thread ); +void bli_packm_unb_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); #undef GENTPROT diff --git a/frame/cntl/bli_cntl.c b/frame/1m/scalm/bli_scalm.h similarity index 95% rename from frame/cntl/bli_cntl.c rename to frame/1m/scalm/bli_scalm.h index ffd6120c8..303ec3860 100644 --- a/frame/cntl/bli_cntl.c +++ b/frame/1m/scalm/bli_scalm.h @@ -32,9 +32,5 @@ */ -#include "blis.h" +#include "bli_scalm_cntl.h" -void bli_cntl_obj_free( void* cntl ) -{ - bli_free_intl( cntl ); -} diff --git a/frame/1m/scalm/bli_scalm_cntl.c b/frame/1m/scalm/bli_scalm_cntl.c index 4a965b3fa..f6008a9a3 100644 --- a/frame/1m/scalm/bli_scalm_cntl.c +++ b/frame/1m/scalm/bli_scalm_cntl.c @@ -34,38 +34,25 @@ #include "blis.h" -scalm_t* scalm_cntl = NULL; - -void bli_scalm_cntl_init() +cntl_t* bli_scalm_cntl_obj_create + ( + void* var_func, + cntl_t* sub_node + ) { - 
scalm_cntl = bli_scalm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1 ); -} + cntl_t* cntl; -void bli_scalm_cntl_finalize() -{ - bli_cntl_obj_free( scalm_cntl ); -} - - -scalm_t* bli_scalm_cntl_obj_create( impl_t impl_type, - varnum_t var_num ) -{ - scalm_t* cntl; - - cntl = ( scalm_t* ) bli_malloc_intl( sizeof(scalm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. + cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + NULL, + sub_node + ); return cntl; } - -void bli_scalm_cntl_obj_init( scalm_t* cntl, - impl_t impl_type, - varnum_t var_num ) -{ - cntl->impl_type = impl_type; - cntl->var_num = var_num; -} - diff --git a/frame/1m/scalm/bli_scalm_cntl.h b/frame/1m/scalm/bli_scalm_cntl.h index ccda9217e..4029a4f10 100644 --- a/frame/1m/scalm/bli_scalm_cntl.h +++ b/frame/1m/scalm/bli_scalm_cntl.h @@ -32,20 +32,9 @@ */ -struct scalm_s -{ - impl_t impl_type; - varnum_t var_num; -}; -typedef struct scalm_s scalm_t; - -#define bli_cntl_sub_scalm( cntl ) cntl->sub_scalm - -void bli_scalm_cntl_init( void ); -void bli_scalm_cntl_finalize( void ); -scalm_t* bli_scalm_cntl_obj_create( impl_t impl_type, - varnum_t var_num ); -void bli_scalm_cntl_obj_init( scalm_t* cntl, - impl_t impl_type, - varnum_t var_num ); +cntl_t* bli_scalm_cntl_obj_create + ( + void* var_func, + cntl_t* sub_node + ); diff --git a/frame/1m/scalm/bli_scalm_int.c b/frame/1m/scalm/other/bli_scalm_int.c similarity index 100% rename from frame/1m/scalm/bli_scalm_int.c rename to frame/1m/scalm/other/bli_scalm_int.c diff --git a/frame/1m/scalm/bli_scalm_int.h b/frame/1m/scalm/other/bli_scalm_int.h similarity index 100% rename from frame/1m/scalm/bli_scalm_int.h rename to frame/1m/scalm/other/bli_scalm_int.h diff --git 
a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index 8254f5043..e300cb66f 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -37,8 +37,7 @@ #include "bli_unpackm_int.h" #include "bli_unpackm_unb_var1.h" -//#include "bli_unpackm_blk_var1.h" -#include "bli_unpackm_blk_var2.h" +#include "bli_unpackm_blk_var1.h" #include "bli_unpackm_cxk.h" diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c similarity index 96% rename from frame/1m/unpackm/bli_unpackm_blk_var2.c rename to frame/1m/unpackm/bli_unpackm_blk_var1.c index ab2c2cf1c..bb9f0ee22 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var2.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -52,13 +52,17 @@ typedef void (*FUNCPTR_T)( cntx_t* cntx ); -static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var2); +static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1); -void bli_unpackm_blk_var2( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ) +void bli_unpackm_blk_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -266,5 +270,5 @@ void PASTEMAC(ch,varname) \ \ } -INSERT_GENTFUNC_BASIC0( unpackm_blk_var2 ) +INSERT_GENTFUNC_BASIC0( unpackm_blk_var1 ) diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.h b/frame/1m/unpackm/bli_unpackm_blk_var1.h new file mode 100644 index 000000000..330e9b089 --- /dev/null +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +void bli_unpackm_blk_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); + + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + dim_t m, \ + dim_t n, \ + dim_t m_panel, \ + dim_t n_panel, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROT_BASIC( unpackm_blk_var1 ) + diff --git a/frame/1m/unpackm/bli_unpackm_check.c b/frame/1m/unpackm/bli_unpackm_check.c index 87af08f43..0ffa984b2 100644 --- a/frame/1m/unpackm/bli_unpackm_check.c +++ b/frame/1m/unpackm/bli_unpackm_check.c @@ -34,10 +34,12 @@ #include "blis.h" -void bli_unpackm_check( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl ) +void bli_unpackm_int_check + ( + obj_t* p, + obj_t* a, + cntx_t* cntx + ) { err_t e_val; diff --git a/frame/1m/unpackm/bli_unpackm_check.h b/frame/1m/unpackm/bli_unpackm_check.h index 217b03c4a..889dd7831 100644 --- a/frame/1m/unpackm/bli_unpackm_check.h +++ b/frame/1m/unpackm/bli_unpackm_check.h @@ -32,7 +32,10 @@ */ -void bli_unpackm_check( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl ); +void bli_unpackm_int_check + ( + obj_t* p, + obj_t* a, + cntx_t* cntx + ); + diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index 0e99bb741..2900cb3b8 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -34,42 +34,35 @@ #include "blis.h" -unpackm_t* unpackm_cntl = NULL; - -void bli_unpackm_cntl_init() +cntl_t* bli_unpackm_cntl_obj_create + ( + void* var_func, + void* unpackm_var_func, + cntl_t* sub_node + ) { - unpackm_cntl = bli_unpackm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, - NULL ); // no blocksize needed -} + cntl_t* cntl; + unpackm_params_t* params; -void bli_unpackm_cntl_finalize() -{ - 
bli_cntl_obj_free( unpackm_cntl ); -} + // Allocate an unpackm_params_t struct. + params = bli_malloc_intl( sizeof( unpackm_params_t ) ); -unpackm_t* bli_unpackm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - blksz_t* b ) -{ - unpackm_t* cntl; + // Initialize the unpackm_params_t struct. + params->size = sizeof( unpackm_params_t ); + params->var_func = unpackm_var_func; - cntl = ( unpackm_t* ) bli_malloc_intl( sizeof(unpackm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->b = b; + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. + cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + params, + sub_node + ); return cntl; } -void bli_unpackm_cntl_obj_init( unpackm_t* cntl, - impl_t impl_type, - varnum_t var_num, - blksz_t* b ) -{ - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->b = b; -} - diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h index 8a3935ba4..82d9727fc 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/1m/unpackm/bli_unpackm_cntl.h @@ -32,28 +32,23 @@ */ -struct unpackm_s +struct unpackm_params_s { - impl_t impl_type; - varnum_t var_num; - blksz_t* b; + uint64_t size; // size field must be present and come first. 
+ unpackm_voft var_func; }; -typedef struct unpackm_s unpackm_t; +typedef struct unpackm_params_s unpackm_params_t; -#define bli_cntl_sub_unpackm( cntl ) cntl->sub_unpackm -#define bli_cntl_sub_unpackm_a( cntl ) cntl->sub_unpackm_a -#define bli_cntl_sub_unpackm_a11( cntl ) cntl->sub_unpackm_a11 -#define bli_cntl_sub_unpackm_b( cntl ) cntl->sub_unpackm_b -#define bli_cntl_sub_unpackm_b11( cntl ) cntl->sub_unpackm_b11 -#define bli_cntl_sub_unpackm_c( cntl ) cntl->sub_unpackm_c -#define bli_cntl_sub_unpackm_c11( cntl ) cntl->sub_unpackm_c11 +#define bli_cntl_unpackm_params_var_func( cntl ) \ +\ + ( ( (unpackm_params_t*)(cntl)->params )->var_func ) + +// ----------------------------------------------------------------------------- + +cntl_t* bli_unpackm_cntl_obj_create + ( + void* var_func, + void* unpackm_var_func, + cntl_t* sub_node + ); -void bli_unpackm_cntl_init( void ); -void bli_unpackm_cntl_finalize( void ); -unpackm_t* bli_unpackm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - blksz_t* b ); -void bli_unpackm_cntl_obj_init( unpackm_t* cntl, - impl_t impl_type, - varnum_t var_num, - blksz_t* b ); diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/unpackm/bli_unpackm_cxk.c index a31a7f9dc..0ffaa78e5 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.c +++ b/frame/1m/unpackm/bli_unpackm_cxk.c @@ -152,15 +152,16 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTEMAC(ch,opname)( \ - conj_t conjp, \ - dim_t m, \ - dim_t n, \ - void* beta, \ - void* p, inc_t ldp, \ - void* a, inc_t inca, inc_t lda, \ - cntx_t* cntx \ - ) \ +void PASTEMAC(ch,opname) \ + ( \ + conj_t conjp, \ + dim_t m, \ + dim_t n, \ + void* beta, \ + void* p, inc_t ldp, \ + void* a, inc_t inca, inc_t lda, \ + cntx_t* cntx \ + ) \ { \ dim_t panel_dim; \ num_t dt; \ diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index 62b2b3530..b76d325b9 100644 --- 
a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -34,188 +34,43 @@ #include "blis.h" -#define FUNCPTR_T unpackm_fp - -typedef void (*FUNCPTR_T)( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl ); - -static FUNCPTR_T vars[2][3] = +void bli_unpackm_int + ( + obj_t* p, + obj_t* a, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { - // unblocked optimized unblocked blocked - { bli_unpackm_unb_var1, NULL, NULL, }, - { NULL, NULL, bli_unpackm_blk_var2, }, -}; + unpackm_voft f; -void bli_unpackm_int( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl, - thrinfo_t* thread ) -{ - // The unpackm operation consists of an optional post-process: castm. - // (This post-process is analogous to the castm pre-process in packm.) - // Here are the following possible ways unpackm can execute: - // 1. unpack and cast: Unpack to a temporary matrix c and then cast - // c to a. - // 2. unpack only: Unpack directly to matrix a since typecasting is - // not needed. - // 3. cast only: Not yet supported / not used. - // 4. no-op: The control tree directs us to skip the unpack operation - // entirely. No action is taken. - - obj_t c; - - varnum_t n; - impl_t i; - FUNCPTR_T f; - - // Sanity check; A should never have a zero dimension. If we must support - // it, then we should fold it into the next alias-and-early-exit block. - //if ( bli_obj_has_zero_dim( *a ) ) bli_abort(); - - // First check if we are to skip this operation because the control tree - // is NULL, and if so, simply return. - if ( bli_cntl_is_noop( cntl ) ) - { - return; - } + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_unpackm_int_check( p, a, cntx ); // If p was aliased to a during the pack stage (because it was already // in an acceptable packed/contiguous format), then no unpack is actually // necessary, so we return. - if ( bli_obj_is_alias_of( *p, *a ) ) - { - return; - } + if ( bli_obj_is_alias_of( *p, *a ) ) return; - // Check parameters. 
- if ( bli_error_checking_is_enabled() ) - bli_unpackm_check( p, a, cntx, cntl ); - - // Now, if we are not skipping the unpack operation, then the only - // question left is whether we are to typecast matrix a after unpacking. - if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) - bli_abort(); -/* - if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) - { - // Initialize an object c for the intermediate typecast matrix. - bli_unpackm_init_cast( p, - a, - &c ); - } - else -*/ - { - // If no cast is needed, then aliasing object c to the original - // matrix serves as a minor optimization. This causes the unpackm - // implementation to unpack directly into matrix a. - bli_obj_alias_to( *a, c ); - } - - // Now we are ready to proceed with the unpacking. - - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. - f = vars[n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_unpackm_params_var_func( cntl ); // Invoke the variant. - if( bli_thread_am_ochief( thread ) ) { - f( p, - &c, - cntx, - cntl ); - } - bli_thread_obarrier( thread ); - - // Now, if necessary, we cast the contents of c to matrix a. If casting - // was not necessary, then we are done because the call to the unpackm - // implementation would have unpacked directly to matrix a. -/* - if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) + if ( bli_thread_am_ochief( thread ) ) { - // Copy/typecast matrix c to matrix a. - // NOTE: Here, we use copynzm instead of copym because, in the cases - // where we are unpacking/typecasting a real matrix c to a complex - // matrix a, we want to touch only the real components of a, rather - // than also set the imaginary components to zero. 
This comes about - // because of the fact that, if we are unpacking real-to-complex, - // then it is because all of the computation occurred in the real - // domain, and so we would want to leave whatever imaginary values - // there are in matrix a untouched. Notice that for unpackings that - // entail complex-to-complex data movements, the copynzm operation - // behaves exactly as copym, so no use cases are lost (at least none - // that I can think of). - bli_copynzm( &c, - a ); + f + ( + p, + a, + cntx, + cntl, + thread + ); + } - // NOTE: The above code/comment is outdated. What should happen is - // as follows: - // - If dt(a) is complex and dt(p) is real, then create an alias of - // a and then tweak it so that it looks like a real domain object. - // This will involve: - // - projecting the datatype to real domain - // - scaling both the row and column strides by 2 - // ALL OF THIS should be done in the front-end, NOT here, as - // unpackm() won't even be needed in that case. - } -*/ + // Barrier so that unpacking is done before computation. + bli_thread_obarrier( thread ); } -/* -void bli_unpackm_init_cast( obj_t* p, - obj_t* a, - obj_t* c ) -{ - // The idea here is that we want to create an object c that is identical - // to object a, except that: - // (1) the storage datatype of c is equal to the target datatype of a, - // with the element size of c adjusted accordingly, - // (2) the view offset of c is reset to (0,0), - // (3) object c's main buffer is set to a new memory region acquired - // from the memory manager, or extracted from p if a mem entry is - // already available, (After acquring a mem entry from the memory - // manager, it is cached within p for quick access later on.) - // (4) object c is marked as being stored in a standard, contiguous - // format (ie: column-major order). - // Any transposition encoded within object a will also be encoded in - // object c. 
That way, unpackm handles any needed transposition during - // the unpacking, and the only thing the cast stage needs to do is cast. - - num_t dt_targ_a = bli_obj_target_datatype( *a ); - dim_t m_a = bli_obj_length( *a ); - siz_t elem_size_c = bli_datatype_size( dt_targ_a ); - - inc_t rs_c, cs_c; - - // We begin by copying the basic fields of a. - bli_obj_alias_to( *a, *c ); - - // Update datatype and element size fields. - bli_obj_set_datatype( dt_targ_a, *c ); - bli_obj_set_elem_size( elem_size_c, *c ); - - // Reset the view offsets to (0,0). - bli_obj_set_offs( 0, 0, *c ); - - // Check the mem_t entry of p associated with the cast buffer. If it is - // NULL, then acquire memory sufficient to hold the object data and cache - // it to p. (Otherwise, if it is non-NULL, then memory has already been - // acquired from the memory manager and cached.) We then set the main - // buffer of c to the cached address of the cast memory. - bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); - - // Update the strides. We set the increments to reflect column-major order - // storage. We start the leading dimension out as m(a) and increment it if - // necessary so that the beginning of each column is aligned. 
- cs_c = bli_align_dim_to_size( m_a, elem_size_c, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - rs_c = 1; - bli_obj_set_strides( rs_c, cs_c, *c ); -} -*/ diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h index 6e7a26a13..26cf7877b 100644 --- a/frame/1m/unpackm/bli_unpackm_int.h +++ b/frame/1m/unpackm/bli_unpackm_int.h @@ -32,14 +32,12 @@ */ -void bli_unpackm_int( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl, - thrinfo_t* thread ); +void bli_unpackm_int + ( + obj_t* p, + obj_t* a, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); -/* -void bli_unpackm_init_cast( obj_t* p, - obj_t* a, - obj_t* c ); -*/ diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.c b/frame/1m/unpackm/bli_unpackm_unb_var1.c index 0794f6c4f..9e86a78de 100644 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.c +++ b/frame/1m/unpackm/bli_unpackm_unb_var1.c @@ -50,10 +50,14 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,unpackm_unb_var1); -void bli_unpackm_unb_var1( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ) +void bli_unpackm_unb_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_pc = bli_obj_datatype( *p ); diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.h b/frame/1m/unpackm/bli_unpackm_unb_var1.h index fcb98bda5..40c921522 100644 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.h +++ b/frame/1m/unpackm/bli_unpackm_unb_var1.h @@ -32,10 +32,14 @@ */ -void bli_unpackm_unb_var1( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ); +void bli_unpackm_unb_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ diff --git a/frame/2/gemv/bli_gemv.h b/frame/2/gemv/bli_gemv.h index b7c39613c..b4c6b4816 100644 --- a/frame/2/gemv/bli_gemv.h +++ b/frame/2/gemv/bli_gemv.h @@ -32,9 +32,10 @@ */ -#include "bli_gemv_cntl.h" -#include "bli_gemv_front.h" -#include "bli_gemv_int.h" +// NOTE: level-2 control 
tree code is temporarily disabled. +//#include "bli_gemv_cntl.h" +//#include "bli_gemv_front.h" +//#include "bli_gemv_int.h" #include "bli_gemv_var.h" diff --git a/frame/2/gemv/bli_gemv_var.h b/frame/2/gemv/bli_gemv_var.h index 9dd3f5d71..4e2a03908 100644 --- a/frame/2/gemv/bli_gemv_var.h +++ b/frame/2/gemv/bli_gemv_var.h @@ -48,7 +48,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - gemv_t* cntl \ + cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) diff --git a/frame/2/gemv/bli_gemv_var_oapi.c b/frame/2/gemv/bli_gemv_var_oapi.c index 6d27452c2..f1662c922 100644 --- a/frame/2/gemv/bli_gemv_var_oapi.c +++ b/frame/2/gemv/bli_gemv_var_oapi.c @@ -45,7 +45,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - gemv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/gemv/old/bli_gemv_var_oapi.c.prev b/frame/2/gemv/old/bli_gemv_var_oapi.c.prev new file mode 100644 index 000000000..771cfbf12 --- /dev/null +++ b/frame/2/gemv/old/bli_gemv_var_oapi.c.prev @@ -0,0 +1,97 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENFRONT +#define GENFRONT( ftname, opname ) \ +\ +/*static gemv_vft GENARRAY(ftypes,gemv_unb_var1);*/ \ +static GENARRAY_VFP(ftname,opname); \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y, \ + cntx_t* cntx, \ + gemv_t* cntl \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *a ); \ +\ + trans_t transa = bli_obj_conjtrans_status( *a ); \ + conj_t conjx = bli_obj_conj_status( *x ); \ +\ + dim_t m = bli_obj_length( *a ); \ + dim_t n = bli_obj_width( *a ); \ +\ + void* buf_a = bli_obj_buffer_at_off( *a ); \ + inc_t rs_a = bli_obj_row_stride( *a ); \ + inc_t cs_a = bli_obj_col_stride( *a ); \ +\ + void* buf_x = bli_obj_buffer_at_off( *x ); \ + inc_t incx = bli_obj_vector_inc( *x ); \ +\ + void* buf_y = bli_obj_buffer_at_off( *y ); \ + inc_t incy = bli_obj_vector_inc( *y ); \ +\ + void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); \ + void* buf_beta = bli_obj_buffer_for_1x1( dt, *beta ); \ +\ + PASTECH(ftname,_vft) f = PASTECH(opname,_vfp)[dt]; \ +\ + /* Invoke the void pointer-based function for the given datatype. 
*/ \ + f( \ + transa, \ + conjx, \ + m, \ + n, \ + buf_alpha, \ + buf_a, rs_a, cs_a, \ + buf_x, incx, \ + buf_beta, \ + buf_y, incy, \ + cntx \ + ); \ +} \ + +GENFRONT( gemv, gemv_unb_var1 ) +GENFRONT( gemv, gemv_unb_var2 ) + +GENFRONT( gemv, gemv_unf_var1 ) +GENFRONT( gemv, gemv_unf_var2 ) + diff --git a/frame/2/gemv/bli_gemv_blk_var1.c b/frame/2/gemv/other/bli_gemv_blk_var1.c similarity index 100% rename from frame/2/gemv/bli_gemv_blk_var1.c rename to frame/2/gemv/other/bli_gemv_blk_var1.c diff --git a/frame/2/gemv/bli_gemv_blk_var2.c b/frame/2/gemv/other/bli_gemv_blk_var2.c similarity index 100% rename from frame/2/gemv/bli_gemv_blk_var2.c rename to frame/2/gemv/other/bli_gemv_blk_var2.c diff --git a/frame/2/gemv/bli_gemv_cntl.c b/frame/2/gemv/other/bli_gemv_cntl.c similarity index 100% rename from frame/2/gemv/bli_gemv_cntl.c rename to frame/2/gemv/other/bli_gemv_cntl.c diff --git a/frame/2/gemv/bli_gemv_cntl.h b/frame/2/gemv/other/bli_gemv_cntl.h similarity index 100% rename from frame/2/gemv/bli_gemv_cntl.h rename to frame/2/gemv/other/bli_gemv_cntl.h diff --git a/frame/2/gemv/bli_gemv_front.c b/frame/2/gemv/other/bli_gemv_front.c similarity index 100% rename from frame/2/gemv/bli_gemv_front.c rename to frame/2/gemv/other/bli_gemv_front.c diff --git a/frame/2/gemv/bli_gemv_front.h b/frame/2/gemv/other/bli_gemv_front.h similarity index 100% rename from frame/2/gemv/bli_gemv_front.h rename to frame/2/gemv/other/bli_gemv_front.h diff --git a/frame/2/gemv/bli_gemv_int.c b/frame/2/gemv/other/bli_gemv_int.c similarity index 100% rename from frame/2/gemv/bli_gemv_int.c rename to frame/2/gemv/other/bli_gemv_int.c diff --git a/frame/2/gemv/bli_gemv_int.h b/frame/2/gemv/other/bli_gemv_int.h similarity index 100% rename from frame/2/gemv/bli_gemv_int.h rename to frame/2/gemv/other/bli_gemv_int.h diff --git a/frame/2/ger/bli_ger.h b/frame/2/ger/bli_ger.h index dc6f9e3f9..1d92502a3 100644 --- a/frame/2/ger/bli_ger.h +++ b/frame/2/ger/bli_ger.h @@ -32,8 +32,9 @@ */ 
-#include "bli_ger_cntl.h" -#include "bli_ger_front.h" -#include "bli_ger_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_ger_cntl.h" +//#include "bli_ger_front.h" +//#include "bli_ger_int.h" #include "bli_ger_var.h" diff --git a/frame/2/ger/bli_ger_var.h b/frame/2/ger/bli_ger_var.h index 5833ec3f4..98451dcae 100644 --- a/frame/2/ger/bli_ger_var.h +++ b/frame/2/ger/bli_ger_var.h @@ -47,7 +47,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ - ger_t* cntl \ + cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) diff --git a/frame/2/ger/bli_ger_var_oapi.c b/frame/2/ger/bli_ger_var_oapi.c index f03452dce..5c4aa113f 100644 --- a/frame/2/ger/bli_ger_var_oapi.c +++ b/frame/2/ger/bli_ger_var_oapi.c @@ -44,7 +44,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ - ger_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/ger/bli_ger_blk_var1.c b/frame/2/ger/other/bli_ger_blk_var1.c similarity index 100% rename from frame/2/ger/bli_ger_blk_var1.c rename to frame/2/ger/other/bli_ger_blk_var1.c diff --git a/frame/2/ger/bli_ger_blk_var2.c b/frame/2/ger/other/bli_ger_blk_var2.c similarity index 100% rename from frame/2/ger/bli_ger_blk_var2.c rename to frame/2/ger/other/bli_ger_blk_var2.c diff --git a/frame/2/ger/bli_ger_cntl.c b/frame/2/ger/other/bli_ger_cntl.c similarity index 100% rename from frame/2/ger/bli_ger_cntl.c rename to frame/2/ger/other/bli_ger_cntl.c diff --git a/frame/2/ger/bli_ger_cntl.h b/frame/2/ger/other/bli_ger_cntl.h similarity index 100% rename from frame/2/ger/bli_ger_cntl.h rename to frame/2/ger/other/bli_ger_cntl.h diff --git a/frame/2/ger/bli_ger_front.c b/frame/2/ger/other/bli_ger_front.c similarity index 100% rename from frame/2/ger/bli_ger_front.c rename to frame/2/ger/other/bli_ger_front.c diff --git a/frame/2/ger/bli_ger_front.h b/frame/2/ger/other/bli_ger_front.h similarity index 100% rename from frame/2/ger/bli_ger_front.h rename to 
frame/2/ger/other/bli_ger_front.h diff --git a/frame/2/ger/bli_ger_int.c b/frame/2/ger/other/bli_ger_int.c similarity index 100% rename from frame/2/ger/bli_ger_int.c rename to frame/2/ger/other/bli_ger_int.c diff --git a/frame/2/ger/bli_ger_int.h b/frame/2/ger/other/bli_ger_int.h similarity index 100% rename from frame/2/ger/bli_ger_int.h rename to frame/2/ger/other/bli_ger_int.h diff --git a/frame/2/hemv/bli_hemv.h b/frame/2/hemv/bli_hemv.h index 07b5ff0c0..7ac4b0b13 100644 --- a/frame/2/hemv/bli_hemv.h +++ b/frame/2/hemv/bli_hemv.h @@ -32,9 +32,10 @@ */ -#include "bli_hemv_cntl.h" -#include "bli_hemv_front.h" -#include "bli_hemv_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_hemv_cntl.h" +//#include "bli_hemv_front.h" +//#include "bli_hemv_int.h" #include "bli_hemv_var.h" diff --git a/frame/2/hemv/bli_hemv_var.h b/frame/2/hemv/bli_hemv_var.h index cf0e25bd4..db00df441 100644 --- a/frame/2/hemv/bli_hemv_var.h +++ b/frame/2/hemv/bli_hemv_var.h @@ -49,7 +49,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - hemv_t* cntl \ + cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) diff --git a/frame/2/hemv/bli_hemv_var_oapi.c b/frame/2/hemv/bli_hemv_var_oapi.c index c0fc00ad4..a73dbe9b3 100644 --- a/frame/2/hemv/bli_hemv_var_oapi.c +++ b/frame/2/hemv/bli_hemv_var_oapi.c @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - hemv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/hemv/bli_hemv_blk_var1.c b/frame/2/hemv/other/bli_hemv_blk_var1.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var1.c rename to frame/2/hemv/other/bli_hemv_blk_var1.c diff --git a/frame/2/hemv/bli_hemv_blk_var2.c b/frame/2/hemv/other/bli_hemv_blk_var2.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var2.c rename to frame/2/hemv/other/bli_hemv_blk_var2.c diff --git a/frame/2/hemv/bli_hemv_blk_var3.c b/frame/2/hemv/other/bli_hemv_blk_var3.c 
similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var3.c rename to frame/2/hemv/other/bli_hemv_blk_var3.c diff --git a/frame/2/hemv/bli_hemv_blk_var4.c b/frame/2/hemv/other/bli_hemv_blk_var4.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var4.c rename to frame/2/hemv/other/bli_hemv_blk_var4.c diff --git a/frame/2/hemv/bli_hemv_cntl.c b/frame/2/hemv/other/bli_hemv_cntl.c similarity index 100% rename from frame/2/hemv/bli_hemv_cntl.c rename to frame/2/hemv/other/bli_hemv_cntl.c diff --git a/frame/2/hemv/bli_hemv_cntl.h b/frame/2/hemv/other/bli_hemv_cntl.h similarity index 100% rename from frame/2/hemv/bli_hemv_cntl.h rename to frame/2/hemv/other/bli_hemv_cntl.h diff --git a/frame/2/hemv/bli_hemv_front.c b/frame/2/hemv/other/bli_hemv_front.c similarity index 100% rename from frame/2/hemv/bli_hemv_front.c rename to frame/2/hemv/other/bli_hemv_front.c diff --git a/frame/2/hemv/bli_hemv_front.h b/frame/2/hemv/other/bli_hemv_front.h similarity index 100% rename from frame/2/hemv/bli_hemv_front.h rename to frame/2/hemv/other/bli_hemv_front.h diff --git a/frame/2/hemv/bli_hemv_int.c b/frame/2/hemv/other/bli_hemv_int.c similarity index 100% rename from frame/2/hemv/bli_hemv_int.c rename to frame/2/hemv/other/bli_hemv_int.c diff --git a/frame/2/hemv/bli_hemv_int.h b/frame/2/hemv/other/bli_hemv_int.h similarity index 100% rename from frame/2/hemv/bli_hemv_int.h rename to frame/2/hemv/other/bli_hemv_int.h diff --git a/frame/2/her/bli_her.h b/frame/2/her/bli_her.h index fe9d2d84e..a9a53d569 100644 --- a/frame/2/her/bli_her.h +++ b/frame/2/her/bli_her.h @@ -32,8 +32,9 @@ */ -#include "bli_her_cntl.h" -#include "bli_her_front.h" -#include "bli_her_int.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_her_cntl.h" +//#include "bli_her_front.h" +//#include "bli_her_int.h" #include "bli_her_var.h" diff --git a/frame/2/her/bli_her_var.h b/frame/2/her/bli_her_var.h index 3e65e2bc4..d4c11a0b5 100644 --- a/frame/2/her/bli_her_var.h +++ b/frame/2/her/bli_her_var.h @@ -47,7 +47,7 @@ void PASTEMAC0(opname) \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ - her_t* cntl \ + cntl_t* cntl \ ); GENPROT( her_blk_var1 ) diff --git a/frame/2/her/bli_her_var_oapi.c b/frame/2/her/bli_her_var_oapi.c index a49cf62e0..3567de196 100644 --- a/frame/2/her/bli_her_var_oapi.c +++ b/frame/2/her/bli_her_var_oapi.c @@ -44,7 +44,7 @@ void PASTEMAC0(opname) \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ - her_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *c ); \ diff --git a/frame/2/her/bli_her_blk_var1.c b/frame/2/her/other/bli_her_blk_var1.c similarity index 100% rename from frame/2/her/bli_her_blk_var1.c rename to frame/2/her/other/bli_her_blk_var1.c diff --git a/frame/2/her/bli_her_blk_var2.c b/frame/2/her/other/bli_her_blk_var2.c similarity index 100% rename from frame/2/her/bli_her_blk_var2.c rename to frame/2/her/other/bli_her_blk_var2.c diff --git a/frame/2/her/bli_her_cntl.c b/frame/2/her/other/bli_her_cntl.c similarity index 100% rename from frame/2/her/bli_her_cntl.c rename to frame/2/her/other/bli_her_cntl.c diff --git a/frame/2/her/bli_her_cntl.h b/frame/2/her/other/bli_her_cntl.h similarity index 100% rename from frame/2/her/bli_her_cntl.h rename to frame/2/her/other/bli_her_cntl.h diff --git a/frame/2/her/bli_her_front.c b/frame/2/her/other/bli_her_front.c similarity index 100% rename from frame/2/her/bli_her_front.c rename to frame/2/her/other/bli_her_front.c diff --git a/frame/2/her/bli_her_front.h b/frame/2/her/other/bli_her_front.h similarity index 100% rename from frame/2/her/bli_her_front.h rename to frame/2/her/other/bli_her_front.h diff --git a/frame/2/her/bli_her_int.c b/frame/2/her/other/bli_her_int.c similarity index 100% rename from 
frame/2/her/bli_her_int.c rename to frame/2/her/other/bli_her_int.c diff --git a/frame/2/her/bli_her_int.h b/frame/2/her/other/bli_her_int.h similarity index 100% rename from frame/2/her/bli_her_int.h rename to frame/2/her/other/bli_her_int.h diff --git a/frame/2/her2/bli_her2.h b/frame/2/her2/bli_her2.h index 273b6841e..acf55b7e2 100644 --- a/frame/2/her2/bli_her2.h +++ b/frame/2/her2/bli_her2.h @@ -32,8 +32,9 @@ */ -#include "bli_her2_cntl.h" -#include "bli_her2_front.h" -#include "bli_her2_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_her2_cntl.h" +//#include "bli_her2_front.h" +//#include "bli_her2_int.h" #include "bli_her2_var.h" diff --git a/frame/2/her2/bli_her2_var.h b/frame/2/her2/bli_her2_var.h index 301b6931e..5df14c9d1 100644 --- a/frame/2/her2/bli_her2_var.h +++ b/frame/2/her2/bli_her2_var.h @@ -49,7 +49,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ - her2_t* cntl \ + cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) diff --git a/frame/2/her2/bli_her2_var_oapi.c b/frame/2/her2/bli_her2_var_oapi.c index 6c87496d6..ff345555e 100644 --- a/frame/2/her2/bli_her2_var_oapi.c +++ b/frame/2/her2/bli_her2_var_oapi.c @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ - her2_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *c ); \ diff --git a/frame/2/her2/bli_her2_blk_var1.c b/frame/2/her2/other/bli_her2_blk_var1.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var1.c rename to frame/2/her2/other/bli_her2_blk_var1.c diff --git a/frame/2/her2/bli_her2_blk_var2.c b/frame/2/her2/other/bli_her2_blk_var2.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var2.c rename to frame/2/her2/other/bli_her2_blk_var2.c diff --git a/frame/2/her2/bli_her2_blk_var3.c b/frame/2/her2/other/bli_her2_blk_var3.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var3.c rename to frame/2/her2/other/bli_her2_blk_var3.c diff --git 
a/frame/2/her2/bli_her2_blk_var4.c b/frame/2/her2/other/bli_her2_blk_var4.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var4.c rename to frame/2/her2/other/bli_her2_blk_var4.c diff --git a/frame/2/her2/bli_her2_cntl.c b/frame/2/her2/other/bli_her2_cntl.c similarity index 100% rename from frame/2/her2/bli_her2_cntl.c rename to frame/2/her2/other/bli_her2_cntl.c diff --git a/frame/2/her2/bli_her2_cntl.h b/frame/2/her2/other/bli_her2_cntl.h similarity index 100% rename from frame/2/her2/bli_her2_cntl.h rename to frame/2/her2/other/bli_her2_cntl.h diff --git a/frame/2/her2/bli_her2_front.c b/frame/2/her2/other/bli_her2_front.c similarity index 100% rename from frame/2/her2/bli_her2_front.c rename to frame/2/her2/other/bli_her2_front.c diff --git a/frame/2/her2/bli_her2_front.h b/frame/2/her2/other/bli_her2_front.h similarity index 100% rename from frame/2/her2/bli_her2_front.h rename to frame/2/her2/other/bli_her2_front.h diff --git a/frame/2/her2/bli_her2_int.c b/frame/2/her2/other/bli_her2_int.c similarity index 100% rename from frame/2/her2/bli_her2_int.c rename to frame/2/her2/other/bli_her2_int.c diff --git a/frame/2/her2/bli_her2_int.h b/frame/2/her2/other/bli_her2_int.h similarity index 100% rename from frame/2/her2/bli_her2_int.h rename to frame/2/her2/other/bli_her2_int.h diff --git a/frame/2/symv/bli_symv.h b/frame/2/symv/bli_symv.h index 5195a4c50..8bb1675dc 100644 --- a/frame/2/symv/bli_symv.h +++ b/frame/2/symv/bli_symv.h @@ -32,5 +32,6 @@ */ -#include "bli_symv_front.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_symv_front.h" diff --git a/frame/2/symv/bli_symv_front.c b/frame/2/symv/other/bli_symv_front.c similarity index 100% rename from frame/2/symv/bli_symv_front.c rename to frame/2/symv/other/bli_symv_front.c diff --git a/frame/2/symv/bli_symv_front.h b/frame/2/symv/other/bli_symv_front.h similarity index 100% rename from frame/2/symv/bli_symv_front.h rename to frame/2/symv/other/bli_symv_front.h diff --git a/frame/2/syr/bli_syr.h b/frame/2/syr/bli_syr.h index 25a5e0a63..897ebe2c5 100644 --- a/frame/2/syr/bli_syr.h +++ b/frame/2/syr/bli_syr.h @@ -32,5 +32,6 @@ */ -#include "bli_syr_front.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_syr_front.h" diff --git a/frame/2/syr/bli_syr_front.c b/frame/2/syr/other/bli_syr_front.c similarity index 100% rename from frame/2/syr/bli_syr_front.c rename to frame/2/syr/other/bli_syr_front.c diff --git a/frame/2/syr/bli_syr_front.h b/frame/2/syr/other/bli_syr_front.h similarity index 100% rename from frame/2/syr/bli_syr_front.h rename to frame/2/syr/other/bli_syr_front.h diff --git a/frame/2/syr2/bli_syr2.h b/frame/2/syr2/bli_syr2.h index 39d45c6c5..22a9813ea 100644 --- a/frame/2/syr2/bli_syr2.h +++ b/frame/2/syr2/bli_syr2.h @@ -32,5 +32,6 @@ */ -#include "bli_syr2_front.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_syr2_front.h" diff --git a/frame/2/syr2/bli_syr2_front.c b/frame/2/syr2/other/bli_syr2_front.c similarity index 100% rename from frame/2/syr2/bli_syr2_front.c rename to frame/2/syr2/other/bli_syr2_front.c diff --git a/frame/2/syr2/bli_syr2_front.h b/frame/2/syr2/other/bli_syr2_front.h similarity index 100% rename from frame/2/syr2/bli_syr2_front.h rename to frame/2/syr2/other/bli_syr2_front.h diff --git a/frame/2/trmv/bli_trmv.h b/frame/2/trmv/bli_trmv.h index 242642a91..8410af719 100644 --- a/frame/2/trmv/bli_trmv.h +++ b/frame/2/trmv/bli_trmv.h @@ -32,9 +32,10 @@ */ -#include "bli_trmv_cntl.h" -#include "bli_trmv_front.h" -#include "bli_trmv_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_trmv_cntl.h" +//#include "bli_trmv_front.h" +//#include "bli_trmv_int.h" #include "bli_trmv_var.h" diff --git a/frame/2/trmv/bli_trmv_var.h b/frame/2/trmv/bli_trmv_var.h index cca3be140..23680469e 100644 --- a/frame/2/trmv/bli_trmv_var.h +++ b/frame/2/trmv/bli_trmv_var.h @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trmv_t* cntl \ + cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) diff --git a/frame/2/trmv/bli_trmv_var_oapi.c b/frame/2/trmv/bli_trmv_var_oapi.c index 75926054b..b3c0bc147 100644 --- a/frame/2/trmv/bli_trmv_var_oapi.c +++ b/frame/2/trmv/bli_trmv_var_oapi.c @@ -43,7 +43,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trmv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/trmv/bli_trmv_cntl.c b/frame/2/trmv/other/bli_trmv_cntl.c similarity index 100% rename from frame/2/trmv/bli_trmv_cntl.c rename to frame/2/trmv/other/bli_trmv_cntl.c diff --git a/frame/2/trmv/bli_trmv_cntl.h b/frame/2/trmv/other/bli_trmv_cntl.h similarity index 100% rename from frame/2/trmv/bli_trmv_cntl.h rename to frame/2/trmv/other/bli_trmv_cntl.h diff --git a/frame/2/trmv/bli_trmv_front.c b/frame/2/trmv/other/bli_trmv_front.c similarity index 
100% rename from frame/2/trmv/bli_trmv_front.c rename to frame/2/trmv/other/bli_trmv_front.c diff --git a/frame/2/trmv/bli_trmv_front.h b/frame/2/trmv/other/bli_trmv_front.h similarity index 100% rename from frame/2/trmv/bli_trmv_front.h rename to frame/2/trmv/other/bli_trmv_front.h diff --git a/frame/2/trmv/bli_trmv_int.c b/frame/2/trmv/other/bli_trmv_int.c similarity index 100% rename from frame/2/trmv/bli_trmv_int.c rename to frame/2/trmv/other/bli_trmv_int.c diff --git a/frame/2/trmv/bli_trmv_int.h b/frame/2/trmv/other/bli_trmv_int.h similarity index 100% rename from frame/2/trmv/bli_trmv_int.h rename to frame/2/trmv/other/bli_trmv_int.h diff --git a/frame/2/trmv/bli_trmv_l_blk_var1.c b/frame/2/trmv/other/bli_trmv_l_blk_var1.c similarity index 100% rename from frame/2/trmv/bli_trmv_l_blk_var1.c rename to frame/2/trmv/other/bli_trmv_l_blk_var1.c diff --git a/frame/2/trmv/bli_trmv_l_blk_var2.c b/frame/2/trmv/other/bli_trmv_l_blk_var2.c similarity index 100% rename from frame/2/trmv/bli_trmv_l_blk_var2.c rename to frame/2/trmv/other/bli_trmv_l_blk_var2.c diff --git a/frame/2/trmv/bli_trmv_u_blk_var1.c b/frame/2/trmv/other/bli_trmv_u_blk_var1.c similarity index 100% rename from frame/2/trmv/bli_trmv_u_blk_var1.c rename to frame/2/trmv/other/bli_trmv_u_blk_var1.c diff --git a/frame/2/trmv/bli_trmv_u_blk_var2.c b/frame/2/trmv/other/bli_trmv_u_blk_var2.c similarity index 100% rename from frame/2/trmv/bli_trmv_u_blk_var2.c rename to frame/2/trmv/other/bli_trmv_u_blk_var2.c diff --git a/frame/2/trsv/bli_trsv.h b/frame/2/trsv/bli_trsv.h index 7b51ed69a..9d9384422 100644 --- a/frame/2/trsv/bli_trsv.h +++ b/frame/2/trsv/bli_trsv.h @@ -32,9 +32,10 @@ */ -#include "bli_trsv_cntl.h" -#include "bli_trsv_front.h" -#include "bli_trsv_int.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_trsv_cntl.h" +//#include "bli_trsv_front.h" +//#include "bli_trsv_int.h" #include "bli_trsv_var.h" diff --git a/frame/2/trsv/bli_trsv_var.h b/frame/2/trsv/bli_trsv_var.h index bc66f49ff..395d89d5d 100644 --- a/frame/2/trsv/bli_trsv_var.h +++ b/frame/2/trsv/bli_trsv_var.h @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trsv_t* cntl \ + cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) diff --git a/frame/2/trsv/bli_trsv_var_oapi.c b/frame/2/trsv/bli_trsv_var_oapi.c index f38a5123f..e26bb3abd 100644 --- a/frame/2/trsv/bli_trsv_var_oapi.c +++ b/frame/2/trsv/bli_trsv_var_oapi.c @@ -43,7 +43,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trsv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/trsv/bli_trsv_cntl.c b/frame/2/trsv/other/bli_trsv_cntl.c similarity index 100% rename from frame/2/trsv/bli_trsv_cntl.c rename to frame/2/trsv/other/bli_trsv_cntl.c diff --git a/frame/2/trsv/bli_trsv_cntl.h b/frame/2/trsv/other/bli_trsv_cntl.h similarity index 100% rename from frame/2/trsv/bli_trsv_cntl.h rename to frame/2/trsv/other/bli_trsv_cntl.h diff --git a/frame/2/trsv/bli_trsv_front.c b/frame/2/trsv/other/bli_trsv_front.c similarity index 100% rename from frame/2/trsv/bli_trsv_front.c rename to frame/2/trsv/other/bli_trsv_front.c diff --git a/frame/2/trsv/bli_trsv_front.h b/frame/2/trsv/other/bli_trsv_front.h similarity index 100% rename from frame/2/trsv/bli_trsv_front.h rename to frame/2/trsv/other/bli_trsv_front.h diff --git a/frame/2/trsv/bli_trsv_int.c b/frame/2/trsv/other/bli_trsv_int.c similarity index 100% rename from frame/2/trsv/bli_trsv_int.c rename to frame/2/trsv/other/bli_trsv_int.c diff --git a/frame/2/trsv/bli_trsv_int.h b/frame/2/trsv/other/bli_trsv_int.h similarity index 100% rename from frame/2/trsv/bli_trsv_int.h rename to frame/2/trsv/other/bli_trsv_int.h diff --git a/frame/2/trsv/bli_trsv_l_blk_var1.c 
b/frame/2/trsv/other/bli_trsv_l_blk_var1.c similarity index 100% rename from frame/2/trsv/bli_trsv_l_blk_var1.c rename to frame/2/trsv/other/bli_trsv_l_blk_var1.c diff --git a/frame/2/trsv/bli_trsv_l_blk_var2.c b/frame/2/trsv/other/bli_trsv_l_blk_var2.c similarity index 100% rename from frame/2/trsv/bli_trsv_l_blk_var2.c rename to frame/2/trsv/other/bli_trsv_l_blk_var2.c diff --git a/frame/2/trsv/bli_trsv_u_blk_var1.c b/frame/2/trsv/other/bli_trsv_u_blk_var1.c similarity index 100% rename from frame/2/trsv/bli_trsv_u_blk_var1.c rename to frame/2/trsv/other/bli_trsv_u_blk_var1.c diff --git a/frame/2/trsv/bli_trsv_u_blk_var2.c b/frame/2/trsv/other/bli_trsv_u_blk_var2.c similarity index 100% rename from frame/2/trsv/bli_trsv_u_blk_var2.c rename to frame/2/trsv/other/bli_trsv_u_blk_var2.c diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 13111fd60..ea7926d32 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -33,14 +33,17 @@ */ #include "bli_l3_cntx.h" +#include "bli_l3_cntl.h" #include "bli_l3_check.h" #include "bli_l3_ft.h" #include "bli_l3_oft.h" +#include "bli_l3_voft.h" #include "bli_l3_blocksize.h" #include "bli_l3_direct.h" #include "bli_l3_prune.h" +#include "bli_l3_packm.h" // Prototype object APIs with and without contexts. #include "bli_oapi_w_cntx.h" @@ -68,6 +71,3 @@ #include "bli_trmm3.h" #include "bli_trsm.h" -// Variant object function pointer types. 
-#include "bli_l3_var_oft.h" - diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 950f13974..630cf03a5 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -34,6 +34,41 @@ #include "blis.h" + +dim_t bli_l3_determine_kc + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* a, + obj_t* b, + bszid_t bszid, + cntx_t* cntx + ) +{ + opid_t family = bli_cntx_family( cntx ); + + if ( family == BLIS_GEMM ) + return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_HERK ) + return bli_herk_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_TRMM ) + return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_TRSM ) + return bli_trsm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + + // This should never execute. + return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); +} + +// ----------------------------------------------------------------------------- + +// +// NOTE: We call a gemm/hemm/symm, trmm, or trsm-specific blocksize +// function to determine the kc blocksize so that we can implement the +// "nudging" of kc to be a multiple of mr or nr, as needed. 
+// + #undef GENFRONT #define GENFRONT( opname, l3op ) \ \ @@ -55,6 +90,7 @@ dim_t PASTEMAC0(opname) \ } GENFRONT( gemm_determine_kc, gemm ) +GENFRONT( herk_determine_kc, herk ) GENFRONT( trmm_determine_kc, trmm ) GENFRONT( trsm_determine_kc, trsm ) @@ -115,6 +151,8 @@ dim_t PASTEMAC0(opname) \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ } \ \ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ \ return b_use; \ @@ -125,6 +163,58 @@ GENFRONT( gemm_determine_kc_b, b ) // ----------------------------------------------------------------------------- +#undef GENFRONT +#define GENFRONT( opname, chdir ) \ +\ +dim_t PASTEMAC0(opname) \ + ( \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ + ) \ +{ \ + num_t dt; \ + blksz_t* bsize; \ + dim_t b_alg, b_max; \ + dim_t b_use; \ + \ + /* bli_*_determine_kc_f(): + + We assume that this function is being called from an algorithm that + is moving "forward" (ie: top to bottom, left to right, top-left + to bottom-right). */ \ +\ + /* bli_*_determine_kc_b(): + + We assume that this function is being called from an algorithm that + is moving "backward" (ie: bottom to top, right to left, bottom-right + to top-left). */ \ +\ + /* Extract the execution datatype and use it to query the corresponding + blocksize and blocksize maximum values from the blksz_t object. */ \ + dt = bli_obj_execution_datatype( *a ); \ + bsize = bli_cntx_get_blksz( bszid, cntx ); \ + b_alg = bli_blksz_get_def( dt, bsize ); \ + b_max = bli_blksz_get_max( dt, bsize ); \ +\ + /* Notice that for herk, we do not need to perform any special handling + for the default and maximum kc blocksizes vis-a-vis MR or NR.
*/ \ +\ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ + b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ +\ + return b_use; \ +} + +GENFRONT( herk_determine_kc_f, f ) +GENFRONT( herk_determine_kc_b, b ) + +// ----------------------------------------------------------------------------- + #undef GENFRONT #define GENFRONT( opname, chdir ) \ \ @@ -174,6 +264,8 @@ dim_t PASTEMAC0(opname) \ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ \ return b_use; \ @@ -231,6 +323,8 @@ dim_t PASTEMAC0(opname) \ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ \ return b_use; \ @@ -239,282 +333,3 @@ dim_t PASTEMAC0(opname) \ GENFRONT( trsm_determine_kc_f, f ) GENFRONT( trsm_determine_kc_b, b ) - - - - - - - - - -#if 0 -dim_t bli_gemm_determine_kc_f - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "forward" (ie: top to bottom, left to right, top-left - // to bottom-right). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. 
- dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if A is Hermitian or symmetric, or NR if B is - // Hermitian or symmetric. If neither case applies, then we leave - // the blocksizes unchanged. - if ( bli_obj_root_is_herm_or_symm( *a ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - else if ( bli_obj_root_is_herm_or_symm( *b ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - - b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_gemm_determine_kc_b - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "backward" (ie: bottom to top, right to left, bottom-right - // to top-left). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if A is Hermitian or symmetric, or NR if B is - // Hermitian or symmetric. If neither case applies, then we leave - // the blocksizes unchanged. 
- if ( bli_obj_root_is_herm_or_symm( *a ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - else if ( bli_obj_root_is_herm_or_symm( *b ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -// ----------------------------------------------------------------------------- - -dim_t bli_trmm_determine_kc_f - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "forward" (ie: top to bottom, left to right, top-left - // to bottom-right). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if the triangular matrix is on the left, or NR - // if the triangular matrix is one the right. 
- if ( bli_obj_root_is_triangular( *a ) ) - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - else - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - - b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_trmm_determine_kc_b - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "backward" (ie: bottom to top, right to left, bottom-right - // to top-left). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if the triangular matrix is on the left, or NR - // if the triangular matrix is one the right. 
- if ( bli_obj_root_is_triangular( *a ) ) - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - else - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -// ----------------------------------------------------------------------------- - -dim_t bli_trsm_determine_kc_f - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "forward" (ie: top to bottom, left to right, top-left - // to bottom-right). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR. We always use MR (rather than sometimes using NR) - // because even when the triangle is on the right, packing of that - // matrix uses MR, since only left-side trsm micro-kernels are - // supported. - mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mr ); - b_max = bli_align_dim_to_mult( b_max, mr ); - - b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_trsm_determine_kc_b - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "backward" (ie: bottom to top, right to left, bottom-right - // to top-left). 
- - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR. We always use MR (rather than sometimes using NR) - // because even when the triangle is on the right, packing of that - // matrix uses MR, since only left-side trsm micro-kernels are - // supported. - mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mr ); - b_max = bli_align_dim_to_mult( b_max, mr ); - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -#endif diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 5898186b1..8f9f7ad80 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -47,7 +47,10 @@ dim_t PASTEMAC0(opname) \ cntx_t* cntx \ ); +GENPROT( l3_determine_kc ) + GENPROT( gemm_determine_kc ) +GENPROT( herk_determine_kc ) GENPROT( trmm_determine_kc ) GENPROT( trsm_determine_kc ) @@ -68,6 +71,9 @@ dim_t PASTEMAC0(opname) \ GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) +GENPROT( herk_determine_kc_f ) +GENPROT( herk_determine_kc_b ) + GENPROT( trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 48249a9b3..e901f2766 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -226,29 +226,6 @@ void bli_syr2k_check bli_check_error_code( e_val ); } -#if 0 -void bli_trmm_check - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx - ) -{ - err_t e_val; - - // Perform checks common to hemm/symm. - - bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx ); - - // Check object structure. 
- - e_val = bli_check_triangular_object( a ); - bli_check_error_code( e_val ); -} -#endif - void bli_trmm_check ( side_t side, diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c new file mode 100644 index 000000000..a8dfee1ba --- /dev/null +++ b/frame/3/bli_l3_cntl.c @@ -0,0 +1,114 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + + +void bli_l3_cntl_create_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t** cntl_use + ) +{ + // If the control tree pointer is NULL, we construct a default + // tree as a function of the operation family. + if ( cntl_orig == NULL ) + { + opid_t family = bli_cntx_get_family( cntx ); + + if ( family == BLIS_GEMM || + family == BLIS_HERK || + family == BLIS_TRMM ) + { + *cntl_use = bli_gemm_cntl_create( family ); + } + else // if ( family == BLIS_TRSM ) + { + side_t side; + + if ( bli_obj_is_triangular( *a ) ) side = BLIS_LEFT; + else side = BLIS_RIGHT; + + *cntl_use = bli_trsm_cntl_create( side ); + } + } + else + { + // If the user provided a control tree, create a copy and use it + // instead (so that it can be used to cache things like pack mem_t + // entries). + *cntl_use = bli_cntl_copy( cntl_orig ); + } +} + +void bli_l3_cntl_free_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t* cntl_use, + thrinfo_t* thread + ) +{ + // If the control tree pointer is NULL, a default tree would have + // been created, so we now must free it. + if ( cntl_orig == NULL ) + { + opid_t family = bli_cntx_get_family( cntx ); + + if ( family == BLIS_GEMM || + family == BLIS_HERK || + family == BLIS_TRMM ) + { + bli_gemm_cntl_free( cntl_use, thread ); + } + else // if ( family == BLIS_TRSM ) + { + bli_trsm_cntl_free( cntl_use, thread ); + } + } + else + { + // If the user provided a control tree, free the copy of it that + // was created. 
+ bli_cntl_free( cntl_use, thread ); + } +} + diff --git a/frame/cntl/bli_cntl_init.h b/frame/3/bli_l3_cntl.h similarity index 79% rename from frame/cntl/bli_cntl_init.h rename to frame/3/bli_l3_cntl.h index a3fdf6279..dc0aeb869 100644 --- a/frame/cntl/bli_cntl_init.h +++ b/frame/3/bli_l3_cntl.h @@ -32,6 +32,29 @@ */ -void bli_cntl_init( void ); -void bli_cntl_finalize( void ); -bool_t bli_cntl_is_initialized( void ); + +// +// Prototype conditional control tree creation functions. +// + +void bli_l3_cntl_create_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t** cntl_use + ); + +void bli_l3_cntl_free_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t* cntl_use, + thrinfo_t* thread + ); + diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c index 3d30cea9e..993501541 100644 --- a/frame/3/bli_l3_direct.c +++ b/frame/3/bli_l3_direct.c @@ -34,6 +34,28 @@ #include "blis.h" +dir_t bli_l3_direct + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx + ) +{ + // Query the operation family. + opid_t family = bli_cntx_family( cntx ); + + if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); + else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c ); + else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c ); + else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c ); + + // This should never execute. 
+ return BLIS_FWD; +} + +// ----------------------------------------------------------------------------- + dir_t bli_gemm_direct ( obj_t* a, diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h index 28c60c428..7b88ba51f 100644 --- a/frame/3/bli_l3_direct.h +++ b/frame/3/bli_l3_direct.h @@ -32,6 +32,15 @@ */ +dir_t bli_l3_direct + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx + ); + +// ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c new file mode 100644 index 000000000..6714022db --- /dev/null +++ b/frame/3/bli_l3_packm.c @@ -0,0 +1,171 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_l3_packm + ( + obj_t* x, + obj_t* x_pack, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + membrk_t* membrk; + packbuf_t pack_buf_type; + mem_t* cntl_mem_p; + siz_t size_needed; + + // Every thread initializes x_pack and determines the size of memory + // block needed (which gets embedded into the otherwise "blank" mem_t + // entry in the control tree node). + size_needed + = + bli_packm_init + ( + x, + x_pack, + cntx, + cntl + ); + + // If zero was returned, no memory needs to be allocated and so we can + // return early. + if ( size_needed == 0 ) return; + + // Query the memory broker from the context. + membrk = bli_cntx_get_membrk( cntx ); + + // Query the pack buffer type from the control tree node. + pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + + // Query the address of the mem_t entry within the control tree node. + cntl_mem_p = bli_cntl_pack_mem( cntl ); + + // Check the mem_t field in the control tree. If it is unallocated, then + // we need to acquire a block from the memory broker and broadcast it to + // all threads in the chief's thread group. + if ( bli_mem_is_unalloc( cntl_mem_p ) ) + { + mem_t* local_mem_p; + mem_t local_mem_s; + + if ( bli_thread_am_ochief( thread ) ) + { + // The chief thread acquires a block from the memory broker + // and saves the associated mem_t entry to local_mem_s. 
+ bli_membrk_acquire_m + ( + membrk, + size_needed, + pack_buf_type, + &local_mem_s + ); + } + + // Broadcast the address of the chief thread's local mem_t entry to + // all threads. + local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + + // Save the contents of the chief thread's local mem_t entry to the + // mem_t field in this thread's control tree node. + *cntl_mem_p = *local_mem_p; + } + else // ( bli_mem_is_alloc( cntl_mem_p ) ) + { + mem_t* local_mem_p; + mem_t local_mem_s; + + // If the mem_t entry in the control tree does NOT contain a NULL + // buffer, then a block has already been acquired from the memory + // broker and cached in the control tree. + + // BUT, we need to make sure that the mem_t object is not associated + // with a block that is too small given the size of the packed matrix + // that we need, according to the return value from packm_init(). + siz_t cntl_mem_size = bli_mem_size( cntl_mem_p ); + + if ( cntl_mem_size < size_needed ) + { + if ( bli_thread_am_ochief( thread ) ) + { + // The chief thread releases the existing block associated with + // the mem_t entry in the control tree, and then re-acquires a + // new block, saving the associated mem_t entry to local_mem_s. + bli_membrk_release( cntl_mem_p ); + bli_membrk_acquire_m + ( + membrk, + size_needed, + pack_buf_type, + &local_mem_s + ); + } + + // Broadcast the address of the chief thread's local mem_t entry to + // all threads. + local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + + // Save the chief thread's local mem_t entry to the mem_t field in + // this thread's control tree node. + *cntl_mem_p = *local_mem_p; + } + else + { + // If the mem_t entry is already allocated and sufficiently large, + // then we use it as-is. No action is needed, because all threads + // will already have the cached values in their local control + // trees' mem_t entries, currently pointed to by cntl_mem_p. 
+ } + } + + + // Update the buffer address in x_pack to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). + bli_obj_set_buffer_to_mem( cntl_mem_p, *x_pack ); + + + // Pack the contents of object x to object x_pack. + bli_packm_int + ( + x, + x_pack, + cntx, + cntl, + thread + ); +} + diff --git a/frame/3/gemm/ind/bli_gemm_blk_var4.h b/frame/3/bli_l3_packm.h similarity index 94% rename from frame/3/gemm/ind/bli_gemm_blk_var4.h rename to frame/3/bli_l3_packm.h index d43f56983..7dc5dfb46 100644 --- a/frame/3/gemm/ind/bli_gemm_blk_var4.h +++ b/frame/3/bli_l3_packm.h @@ -32,13 +32,14 @@ */ -void bli_gemm_blk_var4 +#include "blis.h" + +void bli_l3_packm ( - obj_t* a, - obj_t* b, - obj_t* c, + obj_t* x, + obj_t* x_pack, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index a8c853c56..f908bbb64 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -34,6 +34,86 @@ #include "blis.h" +/* +void bli_l3_prune_unref_mparts_m + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx + ) +{ + // Query the operation family. + opid_t family = bli_cntx_family( cntx ); + + if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. + else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c ); + else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c ); + else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c ); +} +*/ + +#undef GENFRONT +#define GENFRONT( dim ) \ +\ +void PASTEMAC(l3_prune_unref_mparts_,dim) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx \ + ) \ +{ \ + /* Query the operation family. */ \ + opid_t family = bli_cntx_family( cntx ); \ +\ + if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. 
*/ \ + else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \ + else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \ + else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \ +} + +GENFRONT( m ) +GENFRONT( n ) +GENFRONT( k ) + +// ----------------------------------------------------------------------------- + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_prune_unref_mparts_m) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ) \ +{ \ + /* No pruning is necessary for gemm. */ \ +} \ +void PASTEMAC(opname,_prune_unref_mparts_n) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ) \ +{ \ + /* No pruning is necessary for gemm. */ \ +} \ +void PASTEMAC(opname,_prune_unref_mparts_k) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ) \ +{ \ + /* No pruning is necessary for gemm. */ \ +} + +GENFRONT( gemm ) + +// ----------------------------------------------------------------------------- + #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index b4870407d..13d661ff1 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -33,6 +33,23 @@ */ +#undef GENPROT +#define GENPROT( dim ) \ +\ +void PASTEMAC(l3_prune_unref_mparts_,dim) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx \ + ); + +GENPROT( m ) +GENPROT( n ) +GENPROT( k ) + +// ----------------------------------------------------------------------------- + #undef GENPROT #define GENPROT( opname, dim ) \ \ @@ -43,6 +60,10 @@ void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ obj_t* c \ ); +GENPROT( gemm, m ) +GENPROT( gemm, n ) +GENPROT( gemm, k ) + GENPROT( herk, m ) GENPROT( herk, n ) GENPROT( herk, k ) diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 0bea43e9d..2505d37a4 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -43,9 +43,7 @@ thrinfo_t* bli_l3_thrinfo_create dim_t 
icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ) { return bli_thrinfo_create @@ -54,9 +52,8 @@ thrinfo_t* bli_l3_thrinfo_create icomm, icomm_id, n_way, work_id, - opackm, - ipackm, - sub_self + TRUE, + sub_node ); } @@ -69,9 +66,7 @@ void bli_l3_thrinfo_init dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ) { bli_thrinfo_init @@ -81,9 +76,8 @@ void bli_l3_thrinfo_init icomm, icomm_id, n_way, work_id, - opackm, - ipackm, - sub_self + TRUE, + sub_node ); } @@ -101,20 +95,30 @@ void bli_l3_thrinfo_free ) { if ( thread == NULL || - thread == &BLIS_GEMM_SINGLE_THREADED || - thread == &BLIS_HERK_SINGLE_THREADED + thread == &BLIS_PACKM_SINGLE_THREADED || + thread == &BLIS_GEMM_SINGLE_THREADED ) return; - // Free Communicators - if ( bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( thread->ocomm ); - if ( bli_thrinfo_sub_self( thread ) == NULL && bli_thread_am_ichief( thread ) ) - bli_thrcomm_free( thread->icomm ); + thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); - // Free thrinfo chidren - bli_packm_thrinfo_free( thread->opackm ); - bli_packm_thrinfo_free( thread->ipackm ); - bli_l3_thrinfo_free( thread->sub_self ); + // Free the communicators, but only if the current thrinfo_t struct + // is marked as needing them to be freed. The most common example of + // thrinfo_t nodes NOT marked as needing their comms freed are those + // associated with packm thrinfo_t nodes. + if ( bli_thrinfo_needs_free_comms( thread ) ) + { + // The ochief always frees his communicator, and the ichief frees its + // communicator if we are at the leaf node. + if ( bli_thread_am_ochief( thread ) ) + bli_thrcomm_free( bli_thrinfo_ocomm( thread ) ); + if ( thrinfo_sub_node == NULL && bli_thread_am_ichief( thread ) ) + bli_thrcomm_free( bli_thrinfo_icomm( thread ) ); + } + + // Free all children of the current thrinfo_t. 
+ bli_l3_thrinfo_free( thrinfo_sub_node ); + + // Free the thrinfo_t struct. bli_free_intl( thread ); } @@ -230,14 +234,8 @@ thrinfo_t** bli_l3_thrinfo_create_paths for( int e = 0; e < ir_way; e++ ) { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - + thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); +#if 0 // Macrokernel loops thrinfo_t* ir_info = @@ -309,6 +307,69 @@ thrinfo_t** bli_l3_thrinfo_create_paths jc_comm, jc_comm_id, jc_way, a, pack_jc_out, pack_jc_in, kc_info ); +// assume ic = 2; jr = 4 + + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; = 1*2*4*1 + dim_t kc_nt = ic_way * jr_way * ir_way; = 2*4*1 + dim_t ic_nt = jr_way * ir_way; = 4*1 + dim_t jr_nt = ir_way; = 1 + dim_t ir_nt = 1; +#endif + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t kc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*kc_nt + kc_comm_id; + dim_t global_comm_id = a*jc_nt + jc_comm_id; + + // macro-kernel loops + thrinfo_t* ir_info + = + bli_l3_thrinfo_create( jr_comm, jr_comm_id, + ir_comm, ir_comm_id, + ir_way, e, + NULL ); + thrinfo_t* jr_info + = + bli_l3_thrinfo_create( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + jr_way, d, + ir_info ); + // packa + thrinfo_t* pack_ic_in + = + bli_packm_thrinfo_create( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id, + jr_info ); + // blk_var1 + thrinfo_t* ic_info + = + bli_l3_thrinfo_create( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + ic_way, c, + pack_ic_in ); + // packb + thrinfo_t* pack_kc_in + = + bli_packm_thrinfo_create( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id, + ic_info ); + // blk_var3 + thrinfo_t* kc_info + = + bli_l3_thrinfo_create( jc_comm, jc_comm_id, + kc_comm, 
kc_comm_id, + kc_way, b, + pack_kc_in ); + // blk_var2 + thrinfo_t* jc_info + = + bli_l3_thrinfo_create( global_comm, global_comm_id, + jc_comm, jc_comm_id, + jc_way, a, + kc_info ); paths[global_comm_id] = jc_info; } diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 887fc9900..7eac72298 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -69,9 +69,7 @@ thrinfo_t* bli_l3_thrinfo_create dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ); void bli_l3_thrinfo_init @@ -83,9 +81,7 @@ void bli_l3_thrinfo_init dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single diff --git a/frame/3/bli_l3_voft.h b/frame/3/bli_l3_voft.h new file mode 100644 index 000000000..52210f172 --- /dev/null +++ b/frame/3/bli_l3_voft.h @@ -0,0 +1,76 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L3_VAR_OFT_H +#define BLIS_L3_VAR_OFT_H + + +// +// -- Level-3 variant function types ------------------------------------------- +// + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( gemm ) + + +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( trsm ) + + + +#endif + diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 4a0d00c11..817e48cee 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -40,66 +40,35 @@ void bli_gemm_blk_var1 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { - //The s is for "lives on the stack" - obj_t b_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack = NULL; - obj_t* b_pack = NULL; - obj_t* c1_pack = NULL; + obj_t a1, c1; dir_t direct; dim_t i; dim_t b_alg; + dim_t my_start, my_end; // Determine the direction in which to 
partition (forwards or backwards). - direct = bli_gemm_direct( a, b, c ); + direct = bli_l3_direct( a, b, c, cntx ); - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing B. - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_m( a, b, c, cntx ); - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); - - // Initialize objects passed into bli_packm_init for A and C - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack B (if instructed). - bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_mdim( direct, thread, a, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); + // Determine the current thread's subpartition range. + bli_thread_get_range_mdim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); // Partition along the m dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - // NOTE: Use of a (for execution datatype) is intentional! - // This causes the right blocksize to be used if c and a are - // complex and b is real. b_alg = bli_determine_blocksize( direct, i, my_end, a, bli_cntl_bszid( cntl ), cntx ); @@ -108,53 +77,21 @@ void bli_gemm_blk_var1 i, b_alg, a, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and C1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); // Perform gemm subproblem. - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); + bli_gemm_int + ( + &BLIS_ONE, + &a1, + b, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); + bli_thread_ibarrier( thread ); } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } } diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index b27d70a2f..0fceae6e6 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -40,64 +40,35 @@ void bli_gemm_blk_var2 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { - obj_t a_pack_s; - obj_t b1_pack_s, c1_pack_s; - - obj_t b1, c1; - obj_t* a_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c1_pack = NULL; + obj_t b1, c1; dir_t direct; dim_t i; dim_t b_alg; + dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_gemm_direct( a, b, c ); + direct = bli_l3_direct( a, b, c, cntx ); - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A - bli_obj_init_pack( &a_pack_s ); - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_n( a, b, c, cntx ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for B and C that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &b1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). 
- bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_ndim( direct, thread, b, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); + // Determine the current thread's subpartition range. + bli_thread_get_range_ndim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); // Partition along the n dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - // NOTE: Use of b (for execution datatype) is intentional! - // This causes the right blocksize to be used if c and a are - // complex and b is real. b_alg = bli_determine_blocksize( direct, i, my_end, b, bli_cntl_bszid( cntl ), cntx ); @@ -107,52 +78,20 @@ void bli_gemm_blk_var2 bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, i, b_alg, c, &c1 ); - // Initialize objects for packing A1 and B1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - // Perform gemm subproblem. - bli_gemm_int( &BLIS_ONE, - a_pack, - b1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); + bli_gemm_int + ( + &BLIS_ONE, + a, + &b1, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). 
- // Currently must be done by 1 thread - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); + bli_thread_ibarrier( thread ); } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } } diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index ad5a92ffc..7be9c6a58 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -40,52 +40,23 @@ void bli_gemm_blk_var3 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { - obj_t c_pack_s; - obj_t a1_pack_s, b1_pack_s; - - obj_t a1, b1; - obj_t* a1_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c_pack = NULL; + obj_t a1, b1; dir_t direct; - dim_t i; - dim_t b_alg; - dim_t k_trans; + dim_t i; + dim_t b_alg; + dim_t k_trans; // Determine the direction in which to partition (forwards or backwards). - direct = bli_gemm_direct( a, b, c ); + direct = bli_l3_direct( a, b, c, cntx ); - if( bli_thread_am_ochief( thread ) ){ - // Initialize object for packing C - bli_obj_init_pack( &c_pack_s ); - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - // Initialize pack objects for A and B that are passed into packm_init(). 
- if( bli_thread_am_ichief( thread ) ){ - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &b1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - - // Pack C (if instructed). - bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_k( a, b, c, cntx ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); @@ -94,11 +65,8 @@ void bli_gemm_blk_var3 for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - // NOTE: We call a gemm/hemm/symm-specific function to determine - // the kc blocksize so that we can implement the "nudging" of kc - // to be a multiple of mr or nr, as needed. - b_alg = bli_gemm_determine_kc( direct, i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and B1. bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, @@ -106,60 +74,43 @@ void bli_gemm_blk_var3 bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, b, &b1 ); - // Initialize objects for packing A1 and B1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - // Perform gemm subproblem. 
- bli_gemm_int( &BLIS_ONE, - a1_pack, - b1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread) ); + bli_gemm_int + ( + &BLIS_ONE, + &a1, + &b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread) + ); + + bli_thread_ibarrier( thread ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. - bli_thread_ibarrier( thread ); - if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); + // And since c is a locally aliased obj_t (see _int() function), we + // can simply overwrite the internal beta scalar with BLIS_ONE once + // it has been used in the first iteration. However... + // Unlike variant 3 of gemm and herk, which reset the internal scalar + // on C at the end of the first iteration so that subsequent iterations + // do not erroneously apply beta more than once, it is important that + // this behavior not be applied to trmm. That is because the order of + // computation is always such that the beta that is passed into the + // macro-kernel must be zero, since the macro-kernel only applies that + // beta to (and thus overwrites) the row-panel of C that corresponds to + // the current block intersecting the diagonal. It turns out that this + // same pattern holds for trmm3 as well--except there, the beta scalar + // is potentially non-zero, but is still applied only to the current + // row-panel of C, and thus beta is applied to all of C exactly once. + // Thus, for neither trmm nor trmm3 should we reset the scalar on C + // after the first iteration. 
+ if ( bli_cntx_get_family( cntx ) != BLIS_TRMM ) + if ( i == 0 ) bli_obj_scalar_reset( c ); } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - } } diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 382b82bbd..3f3773418 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -34,140 +34,101 @@ #include "blis.h" -extern scalm_t* scalm_cntl; - -packm_t* gemm_packa_cntl = NULL; -packm_t* gemm_packb_cntl = NULL; - -gemm_t* gemm_cntl_bp_ke = NULL; -gemm_t* gemm_cntl_op_bp = NULL; -gemm_t* gemm_cntl_mm_op = NULL; -gemm_t* gemm_cntl_vl_mm = NULL; - -gemm_t* gemm_cntl = NULL; - -void bli_gemm_cntl_init() +cntl_t* bli_gemm_cntl_create + ( + opid_t family + ) { - // Create control tree objects for packm operations. - gemm_packa_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MR, - BLIS_KR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK ); - - gemm_packb_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_KR, - BLIS_NR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL ); + void* macro_kernel_p = bli_gemm_ker_var2; - // - // Create a control tree for packing A and B, and streaming C. 
- // + // Change the macro-kernel if the operation family is herk or trmm. + if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; - // Create control tree object for lowest-level block-panel kernel. - gemm_cntl_bp_ke - = - bli_gemm_cntl_obj_create( BLIS_UNB_OPT, - BLIS_VARIANT2, - 0, // bszid_t not used by macro-kernel - NULL, NULL, NULL, - NULL, NULL, NULL ); + // Create a node for the macro-kernel. + cntl_t* gemm_cntl_bp_ke = bli_gemm_cntl_obj_create + ( + BLIS_NR, // bszid not used by macro-kernel. + macro_kernel_p, + NULL // no sub-node; this is the leaf of the tree. + ); - // Create control tree object for outer panel (to block-panel) - // problem. - gemm_cntl_op_bp - = - bli_gemm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MC, - NULL, - gemm_packa_cntl, - gemm_packb_cntl, - NULL, - gemm_cntl_bp_ke, - NULL ); + // Create a node for packing matrix A. + cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_gemm_packa, + bli_packm_blk_var1, + BLIS_MR, + BLIS_KR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + gemm_cntl_bp_ke + ); - // Create control tree object for general problem via multiple - // rank-k (outer panel) updates. - gemm_cntl_mm_op - = - bli_gemm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, - BLIS_KC, - NULL, - NULL, - NULL, - NULL, - gemm_cntl_op_bp, - NULL ); + // Create a node for partitioning the m dimension by MC. + cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_obj_create + ( + BLIS_MC, + bli_gemm_blk_var1, + gemm_cntl_packa + ); - // Create control tree object for very large problem via multiple - // general problems. - gemm_cntl_vl_mm - = - bli_gemm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, - BLIS_NC, - NULL, - NULL, - NULL, - NULL, - gemm_cntl_mm_op, - NULL ); + // Create a node for packing matrix B. 
+ cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_gemm_packb, + bli_packm_blk_var1, + BLIS_KR, + BLIS_NR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + gemm_cntl_op_bp + ); - // Alias the "master" gemm control tree to a shorter name. - gemm_cntl = gemm_cntl_vl_mm; + // Create a node for partitioning the k dimension by KC. + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + ( + BLIS_KC, + bli_gemm_blk_var3, + gemm_cntl_packb + ); + + // Create a node for partitioning the n dimension by NC. + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + ( + BLIS_NC, + bli_gemm_blk_var2, + gemm_cntl_mm_op + ); + + return gemm_cntl_vl_mm; } -void bli_gemm_cntl_finalize() +void bli_gemm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ) { - bli_cntl_obj_free( gemm_packa_cntl ); - bli_cntl_obj_free( gemm_packb_cntl ); - - bli_cntl_obj_free( gemm_cntl_bp_ke ); - bli_cntl_obj_free( gemm_cntl_op_bp ); - bli_cntl_obj_free( gemm_cntl_mm_op ); - bli_cntl_obj_free( gemm_cntl_vl_mm ); + bli_cntl_free( cntl, thread ); } -gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_packm_a, - packm_t* sub_packm_b, - packm_t* sub_packm_c, - gemm_t* sub_gemm, - unpackm_t* sub_unpackm_c ) +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ) { - gemm_t* cntl; - - cntl = ( gemm_t* ) bli_malloc_intl( sizeof(gemm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bszid = bszid; - cntl->sub_scalm = sub_scalm; - cntl->sub_packm_a = sub_packm_a; - cntl->sub_packm_b = sub_packm_b; - cntl->sub_packm_c = sub_packm_c; - cntl->sub_gemm = sub_gemm; - cntl->sub_unpackm_c = sub_unpackm_c; - - return cntl; + return bli_cntl_obj_create( bszid, var_func, 
NULL, sub_node ); } diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 507a1dd14..5b985327c 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -32,31 +32,23 @@ */ -struct gemm_s -{ - impl_t impl_type; - varnum_t var_num; - bszid_t bszid; - struct scalm_s* sub_scalm; - struct packm_s* sub_packm_a; - struct packm_s* sub_packm_b; - struct packm_s* sub_packm_c; - struct gemm_s* sub_gemm; - struct unpackm_s* sub_unpackm_c; -}; -typedef struct gemm_s gemm_t; +cntl_t* bli_gemm_cntl_create + ( + opid_t family + ); -#define bli_cntl_sub_gemm( cntl ) cntl->sub_gemm +void bli_gemm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ); -void bli_gemm_cntl_init( void ); -void bli_gemm_cntl_finalize( void ); -gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_pack_a, - packm_t* sub_pack_b, - packm_t* sub_pack_c, - gemm_t* sub_gemm, - unpackm_t* sub_unpack_c ); +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ); diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index a2c7be14f..0782d7272 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -42,7 +42,7 @@ void bli_gemm_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ) { obj_t a_local; @@ -62,7 +62,7 @@ void bli_gemm_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. 
bli_obj_alias_to( *a, a_local ); @@ -73,7 +73,7 @@ void bli_gemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( a_local, b_local ); @@ -82,22 +82,27 @@ void bli_gemm_front bli_obj_induce_trans( c_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_GEMM, cntx ); + + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index fc554196b..9f11f61d4 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -40,5 +40,5 @@ void bli_gemm_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ); diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 88324705f..18e531879 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -34,17 +34,6 @@ #include "blis.h" -#if 1 -static gemm_voft vars[4][3] = -{ - // unblocked optimized unblocked blocked - { NULL, NULL, bli_gemm_blk_var1 }, - { NULL, 
bli_gemm_ker_var2, bli_gemm_blk_var2 }, - { NULL, NULL, bli_gemm_blk_var3 }, - { NULL, NULL, NULL }, -}; -#endif - void bli_gemm_int ( obj_t* alpha, @@ -53,15 +42,13 @@ void bli_gemm_int obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { obj_t a_local; obj_t b_local; obj_t c_local; - varnum_t n; - impl_t i; gemm_voft f; ind_t im; @@ -76,7 +63,7 @@ void bli_gemm_int if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - if( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; @@ -87,32 +74,20 @@ void bli_gemm_int if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { - if( bli_thread_am_ochief( thread ) ) + // This should never execute. + bli_abort(); + + if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } - // Alias A and B in case we need to update attached scalars. + // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); - - // Alias C in case we need to induce a transposition. bli_obj_alias_to( *c, c_local ); - // If we are about to call a leaf-level implementation, and matrix C - // still needs a transposition, then we must induce one by swapping the - // strides and dimensions. Note that this transposition would normally - // be handled explicitly in the packing of C, but if C is not being - // packed, this is our last chance to handle the transposition. - if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) - { - //if( bli_thread_am_ochief( thread ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); - // } - } - // If alpha is non-unit, typecast and apply it to the scalar attached // to B. 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) @@ -127,24 +102,17 @@ void bli_gemm_int bli_obj_scalar_apply_scalar( beta, &c_local ); } - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. - f = vars[n][i]; - // Extract the function pointer from the current control tree node. - //f = bli_cntl_sub_prob( cntl ); + f = bli_cntl_var_func( cntl ); // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations. im = bli_cntx_get_ind_method( cntx ); if ( im != BLIS_NAT ) { - if ( im == BLIS_3M3 && f == bli_gemm_blk_var1 ) f = bli_gemm_blk_var4; - else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var4; - else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var3; + if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa; + else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2; + else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; } // Invoke the variant. 
diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/gemm/bli_gemm_int.h index 73e44fecf..e8580cf95 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/gemm/bli_gemm_int.h @@ -40,7 +40,7 @@ void bli_gemm_int obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index d3f7aee5c..b44564387 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -62,7 +62,7 @@ void bli_gemm_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -239,7 +239,7 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, aux ); \ bli_auxinfo_set_is_b( is_b, aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ diff --git a/frame/3/gemm/ind/bli_gemm_ker_var3.h b/frame/3/gemm/bli_gemm_packab.c similarity index 63% rename from frame/3/gemm/ind/bli_gemm_ker_var3.h rename to frame/3/gemm/bli_gemm_packab.c index 06f71bc83..c0166c828 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var3.h +++ b/frame/3/gemm/bli_gemm_packab.c @@ -32,44 +32,79 @@ */ +#include "blis.h" -// -// Prototype object-based interface. -// -void bli_gemm_ker_var3 +void bli_gemm_packa ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread - ); + ) +{ + obj_t a_pack; + // Pack matrix A according to the control tree node. + bli_l3_packm + ( + a, + &a_pack, + cntx, + cntl, + thread + ); -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); + // Proceed with execution using packed matrix A. + bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} -INSERT_GENTPROT_BASIC( gemm_ker_var3 ) +// ----------------------------------------------------------------------------- + +void bli_gemm_packb + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b_pack; + + // Pack matrix B according to the control tree node. + bli_l3_packm + ( + b, + &b_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix B. 
+ bli_gemm_int + ( + &BLIS_ONE, + a, + &b_pack, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 11c9dd09d..c66587fda 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -46,20 +46,22 @@ void PASTEMAC0(opname) \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ - gemm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) +GENPROT( gemm_packa ) +GENPROT( gemm_packb ) GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: -GENPROT( gemm_blk_var4 ) // 3m3 -GENPROT( gemm_ker_var3 ) // 4m1b -GENPROT( gemm_ker_var4 ) // 3m2 +GENPROT( gemm3m3_packa ) // 3m3 +GENPROT( gemm4mb_ker_var2 ) // 4m1b +GENPROT( gemm3m2_ker_var2 ) // 3m2 // @@ -90,6 +92,6 @@ void PASTEMAC(ch,varname) \ INSERT_GENTPROT_BASIC( gemm_ker_var2 ) // Headers for induced algorithms: -INSERT_GENTPROT_BASIC( gemm_ker_var3 ) // 4m1b -INSERT_GENTPROT_BASIC( gemm_ker_var4 ) // 3m2 +INSERT_GENTPROT_BASIC( gemm4mb_ker_var2 ) // 4m1b +INSERT_GENTPROT_BASIC( gemm3m2_ker_var2 ) // 3m2 diff --git a/frame/3/gemm/ind/bli_gemm_ker_var4.c b/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c similarity index 96% rename from frame/3/gemm/ind/bli_gemm_ker_var4.c rename to frame/3/gemm/ind/bli_gemm3m2_ker_var2.c index 3ef423c26..ea8904183 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var4.c +++ b/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c @@ -53,16 +53,16 @@ typedef void (*FUNCPTR_T)( thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var4); +static FUNCPTR_T GENARRAY(ftypes,gemm3m2_ker_var2); -void bli_gemm_ker_var4 +void bli_gemm3m2_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -241,7 +241,7 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, aux ); \ bli_auxinfo_set_is_b( is_b, aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread 
); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ @@ -354,9 +354,9 @@ void PASTEMAC(ch,varname) \ } \ } \ \ -/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var4: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var4: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "gemm3m2_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm3m2_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ } -INSERT_GENTFUNC_BASIC0( gemm_ker_var4 ) +INSERT_GENTFUNC_BASIC0( gemm3m2_ker_var2 ) diff --git a/frame/3/gemm/ind/bli_gemm3m3_packa.c b/frame/3/gemm/ind/bli_gemm3m3_packa.c new file mode 100644 index 000000000..f6e92020c --- /dev/null +++ b/frame/3/gemm/ind/bli_gemm3m3_packa.c @@ -0,0 +1,142 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm3m3_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a_pack; + + // Make a copy of the context for each stage. + cntx_t cntx_ro = *cntx; + cntx_t cntx_io = *cntx; + cntx_t cntx_rpi = *cntx; + + // ----------------------------------------------------- + + // Initialize the context for the real-only stage. + bli_gemm3m3_cntx_stage( 0, &cntx_ro ); + + // Pack the real-only part of matrix A. + bli_l3_packm + ( + a, + &a_pack, + &cntx_ro, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. + bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + // Only apply beta within the first of three subproblems. + if ( bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c ); + + // ----------------------------------------------------- + + // Initialize the context for the imag-only stage. + bli_gemm3m3_cntx_stage( 1, &cntx_io ); + + // Pack the imag-only part of matrix A. + bli_l3_packm + ( + a, + &a_pack, + &cntx_io, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. 
+ bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + // ----------------------------------------------------- + + // Initialize the context for the real+imag stage. + bli_gemm3m3_cntx_stage( 2, &cntx_rpi ); + + // Pack the real+imag part of matrix A. + bli_l3_packm + ( + a, + &a_pack, + &cntx_rpi, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. + bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + +} + diff --git a/frame/3/gemm/ind/bli_gemm_ker_var3.c b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c similarity index 98% rename from frame/3/gemm/ind/bli_gemm_ker_var3.c rename to frame/3/gemm/ind/bli_gemm4mb_ker_var2.c index f368a02ab..d9d714917 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var3.c +++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c @@ -53,16 +53,16 @@ typedef void (*FUNCPTR_T)( thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var3); +static FUNCPTR_T GENARRAY(ftypes,gemm4mb_ker_var2); -void bli_gemm_ker_var3 +void bli_gemm4mb_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -241,7 +241,7 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, aux ); \ bli_auxinfo_set_is_b( is_b, aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ @@ -352,5 +352,5 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var3: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ } -INSERT_GENTFUNC_BASIC0( gemm_ker_var3 ) +INSERT_GENTFUNC_BASIC0( gemm4mb_ker_var2 ) diff --git a/frame/3/gemm/ind/bli_gemm_blk_var4.c b/frame/3/gemm/ind/bli_gemm_blk_var4.c deleted file mode 
100644 index 10a6afa91..000000000 --- a/frame/3/gemm/ind/bli_gemm_blk_var4.c +++ /dev/null @@ -1,229 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_gemm_blk_var4 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread - ) -{ - //The s is for "lives on the stack" - obj_t b_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack = NULL; - obj_t* b_pack = NULL; - obj_t* c1_pack = NULL; - - dim_t i; - dim_t b_alg; - - // Make a copy of the context for each stage. - cntx_t cntx_ro = *cntx; - cntx_t cntx_io = *cntx; - cntx_t cntx_rpi = *cntx; - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing B. - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by - // chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); - - // Initialize objects passed into bli_packm_init for A and C - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack B (if instructed). - bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_t2b( thread, a, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the m dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: Use of a (for execution datatype) is intentional! - // This causes the right blocksize to be used if c and a are - // complex and b is real. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. 
- bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - - // Initialize the context for the real-only stage. - bli_gemm3m3_cntx_stage( 0, &cntx_ro ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - &cntx_ro, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - &cntx_ro, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - &cntx_ro, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - &cntx_ro, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem (real-only). - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - - // Only apply beta within the first of three subproblems. - if ( bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c1_pack ); - - - // Initialize the context for the imag-only stage. - bli_gemm3m3_cntx_stage( 1, &cntx_io ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - &cntx_io, bli_cntl_sub_packm_a( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - &cntx_io, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem (imag-only). - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - - // Initialize the context for the real+imag stage. - bli_gemm3m3_cntx_stage( 2, &cntx_rpi ); - - // Initialize objects for packing A1 and C1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - &cntx_rpi, bli_cntl_sub_packm_a( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - &cntx_rpi, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem (real+imag). - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - - // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - // It doesn't matter which packm cntl node we pass in, as long - // as it is valid, packm_release() will release the mem_t entry - // stored in a1_pack. - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 4b9f082f6..ed7e03b9c 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -43,7 +43,7 @@ void bli_hemm_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ) { obj_t a_local; @@ -63,7 +63,7 @@ void bli_hemm_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. 
bli_obj_alias_to( *a, a_local ); @@ -74,7 +74,7 @@ void bli_hemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_toggle_conj( a_local ); @@ -89,22 +89,27 @@ void bli_hemm_front bli_obj_swap( a_local, b_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_GEMM, cntx ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h index c369d7be2..e1d40c80e 100644 --- a/frame/3/hemm/bli_hemm_front.h +++ b/frame/3/hemm/bli_hemm_front.h @@ -41,5 +41,5 @@ void bli_hemm_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ); diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 61d54ca79..f72dedf87 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -42,7 +42,7 @@ void bli_her2k_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ) { obj_t alpha_conj; @@ -67,7 +67,7 @@ void bli_her2k_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -94,7 +94,7 @@ void bli_her2k_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); @@ -107,49 +107,43 @@ void bli_her2k_front bli_obj_induce_trans( c_local ); } -#if 0 - // Invoke the internal back-end. - bli_her2k_int( alpha, - &a_local, - &bh_local, - &alpha_conj, - &b_local, - &ah_local, - beta, - &c_local, - cntl ); -#else + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); // Invoke herk twice, using beta only the first time. 
- thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &bh_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &bh_local, + beta, + &c_local, + cntx, + cntl, + infos + ); - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - &alpha_conj, - &b_local, - &ah_local, - &BLIS_ONE, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + &alpha_conj, + &b_local, + &ah_local, + &BLIS_ONE, + &c_local, + cntx, + cntl, + infos + ); - bli_l3_thrinfo_free_paths( infos, n_threads ); - -#endif + bli_l3_thrinfo_free_paths( infos, n_threads ); // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of @@ -158,6 +152,5 @@ void bli_her2k_front // non-zero values. To prevent this, we explicitly set those values // to zero before returning. 
bli_setid( &BLIS_ZERO, &c_local ); - } diff --git a/frame/3/her2k/bli_her2k_front.h b/frame/3/her2k/bli_her2k_front.h index 8a2301eb0..6f1246ea6 100644 --- a/frame/3/her2k/bli_her2k_front.h +++ b/frame/3/her2k/bli_her2k_front.h @@ -40,5 +40,5 @@ void bli_her2k_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ); diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h index 290b8bda3..d9aebc78b 100644 --- a/frame/3/herk/bli_herk.h +++ b/frame/3/herk/bli_herk.h @@ -33,7 +33,6 @@ */ #include "bli_herk_front.h" -#include "bli_herk_int.h" #include "bli_herk_var.h" diff --git a/frame/3/herk/bli_herk_blk_var1.c b/frame/3/herk/bli_herk_blk_var1.c deleted file mode 100644 index 535e4f845..000000000 --- a/frame/3/herk/bli_herk_blk_var1.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_blk_var1 - ( - obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread - ) -{ - obj_t ah_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack; - obj_t* c1_pack; - obj_t* ah_pack; - - dir_t direct; - - dim_t i; - dim_t b_alg; - - // Determine the direction in which to partition (forwards or backwards). - direct = bli_herk_direct( a, ah, c ); - - // Prune any zero region that exists along the partitioning dimension. - bli_herk_prune_unref_mparts_m( a, ah, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A'. - bli_obj_init_pack( &ah_pack_s ); - bli_packm_init( ah, &ah_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - ah_pack = bli_thread_obroadcast( thread, &ah_pack_s ); - - // Initialize pack objects that are passed into packm_init() for A and C. - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A' (if instructed). 
- bli_packm_int( ah, ah_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_weighted_mdim( direct, thread, c, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the m dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. - bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform herk subproblem. - bli_herk_int( &BLIS_ONE, - a1_pack, - ah_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( ah_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/herk/bli_herk_blk_var2.c b/frame/3/herk/bli_herk_blk_var2.c deleted file mode 100644 index 661d875d3..000000000 --- a/frame/3/herk/bli_herk_blk_var2.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_blk_var2 - ( - obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread - ) -{ - obj_t a_pack_s; - obj_t ah1_pack_s, c1_pack_s; - - obj_t ah1, c1; - obj_t* a_pack; - obj_t* ah1_pack; - obj_t* c1_pack; - - dir_t direct; - - dim_t i; - dim_t b_alg; - - // Determine the direction in which to partition (forwards or backwards). - direct = bli_herk_direct( a, ah, c ); - - // Prune any zero region that exists along the partitioning dimension. - bli_herk_prune_unref_mparts_n( a, ah, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A - bli_obj_init_pack( &a_pack_s ); - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for C and A' that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &ah1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). 
- bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_weighted_ndim( direct, thread, c, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the n dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1' and C1. - bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, ah, &ah1 ); - bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1' and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ) ; - - // Pack A1' (if instructed). - bli_packm_int( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ) ; - - // Perform herk subproblem. - bli_herk_int( &BLIS_ONE, - a_pack, - ah1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/herk/bli_herk_blk_var3.c b/frame/3/herk/bli_herk_blk_var3.c deleted file mode 100644 index 547c4a37f..000000000 --- a/frame/3/herk/bli_herk_blk_var3.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_blk_var3 - ( - obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread - ) -{ - obj_t c_pack_s; - obj_t a1_pack_s, ah1_pack_s; - - obj_t a1, ah1; - obj_t* a1_pack = NULL; - obj_t* ah1_pack = NULL; - obj_t* c_pack = NULL; - - dir_t direct; - - dim_t i; - dim_t b_alg; - dim_t k_trans; - - // Determine the direction in which to partition (forwards or backwards). - direct = bli_herk_direct( a, ah, c ); - - // Prune any zero region that exists along the partitioning dimension. - bli_herk_prune_unref_mparts_k( a, ah, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing C. - bli_obj_init_pack( &c_pack_s ); - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - // Initialize all pack objects that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &ah1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s ); - - // Pack C (if instructed). - bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // Query dimension in partitioning direction. 
- k_trans = bli_obj_width_after_trans( *a ); - - // Partition along the k dimension. - for ( i = 0; i < k_trans; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, k_trans, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and A1'. - bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, ah, &ah1 ); - - // Initialize objects for packing A1 and A1'. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform herk subproblem. - bli_herk_int( &BLIS_ONE, - a1_pack, - ah1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. - bli_thread_ibarrier( thread ); - if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); - - } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- if( bli_thread_am_ochief( thread ) ) { - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - } - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) ); - } -} - diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 201ac45ae..3abfa9baf 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -41,7 +41,7 @@ void bli_herk_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ) { obj_t a_local; @@ -63,7 +63,7 @@ void bli_herk_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -79,7 +79,7 @@ void bli_herk_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_toggle_conj( a_local ); bli_obj_toggle_conj( ah_local ); @@ -87,22 +87,28 @@ void bli_herk_front bli_obj_induce_trans( c_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); - // Invoke the internal back-end. 
- bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &ah_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &ah_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + + bli_l3_thrinfo_free_paths( infos, n_threads ); // The Hermitian rank-k product was computed as A*A', even for the // diagonal elements. Mathematically, the imaginary components of @@ -111,6 +117,5 @@ void bli_herk_front // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); - } diff --git a/frame/3/herk/bli_herk_front.h b/frame/3/herk/bli_herk_front.h index 572536493..ef9325969 100644 --- a/frame/3/herk/bli_herk_front.h +++ b/frame/3/herk/bli_herk_front.h @@ -39,5 +39,5 @@ void bli_herk_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 0951337dc..c36b6b826 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -63,7 +63,7 @@ void bli_herk_l_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -273,7 +273,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 2dfec1090..56da59f1a 100644 --- 
a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -63,7 +63,7 @@ void bli_herk_u_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -273,7 +273,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/herk/bli_herk_var.h index fd68d2fd1..a18c9ab49 100644 --- a/frame/3/herk/bli_herk_var.h +++ b/frame/3/herk/bli_herk_var.h @@ -46,17 +46,19 @@ void PASTEMAC0(opname) \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ - gemm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( herk_blk_var1 ) -GENPROT( herk_blk_var2 ) -GENPROT( herk_blk_var3 ) +//GENPROT( herk_blk_var1 ) +//GENPROT( herk_blk_var2 ) +//GENPROT( herk_blk_var3 ) GENPROT( herk_x_ker_var2 ) GENPROT( herk_l_ker_var2 ) GENPROT( herk_u_ker_var2 ) +//GENPROT( herk_packa ) +//GENPROT( herk_packb ) // diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/herk/bli_herk_x_ker_var2.c index 4f29cd4d8..71a4cc59b 100644 --- a/frame/3/herk/bli_herk_x_ker_var2.c +++ b/frame/3/herk/bli_herk_x_ker_var2.c @@ -45,7 +45,7 @@ void bli_herk_x_ker_var2 obj_t* ah, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { diff --git a/frame/cntl/bli_cntl_init.c b/frame/3/herk/old/bli_herk_blk_var1.c similarity index 55% rename from frame/cntl/bli_cntl_init.c rename to frame/3/herk/old/bli_herk_blk_var1.c index b7c53ec65..59a20e878 100644 --- a/frame/cntl/bli_cntl_init.c +++ b/frame/3/herk/old/bli_herk_blk_var1.c @@ -34,71 +34,65 @@ #include "blis.h" -static bool_t bli_cntl_is_init = FALSE; - -void bli_cntl_init( void ) +void bli_herk_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + 
cntl_t* cntl, + thrinfo_t* thread + ) { - // If the API is already initialized, return early. - if ( bli_cntl_is_initialized() ) return; + obj_t a1, c1; - // Level-1 - bli_scalv_cntl_init(); - bli_packv_cntl_init(); - bli_unpackv_cntl_init(); + dir_t direct; - // Level-1m - bli_scalm_cntl_init(); - bli_packm_cntl_init(); - bli_unpackm_cntl_init(); + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; - // Level-2 - bli_gemv_cntl_init(); - bli_ger_cntl_init(); - bli_hemv_cntl_init(); - bli_her_cntl_init(); - bli_her2_cntl_init(); - bli_trmv_cntl_init(); - bli_trsv_cntl_init(); + // Determine the direction in which to partition (forwards or backwards). + direct = bli_herk_direct( a, b, c ); - // Level-3 - bli_gemm_cntl_init(); - bli_trsm_cntl_init(); + // Prune any zero region that exists along the partitioning dimension. + bli_herk_prune_unref_mparts_m( a, b, c ); - // Mark API as initialized. - bli_cntl_is_init = TRUE; -} - -void bli_cntl_finalize( void ) -{ - // Level-1 - bli_scalv_cntl_finalize(); - bli_packv_cntl_finalize(); - bli_unpackv_cntl_finalize(); - - // Level-1m - bli_scalm_cntl_finalize(); - bli_packm_cntl_finalize(); - bli_unpackm_cntl_finalize(); - - // Level-2 - bli_gemv_cntl_finalize(); - bli_ger_cntl_finalize(); - bli_hemv_cntl_finalize(); - bli_her_cntl_finalize(); - bli_her2_cntl_finalize(); - bli_trmv_cntl_finalize(); - bli_trsv_cntl_finalize(); - - // Level-3 - bli_gemm_cntl_finalize(); - bli_trsm_cntl_finalize(); - - // Mark API as uninitialized. - bli_cntl_is_init = FALSE; -} - -bool_t bli_cntl_is_initialized( void ) -{ - return bli_cntl_is_init; + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_mdim + ( + direct, thread, a, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); + + // Partition along the m dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. 
+ b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and C1. + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform herk subproblem. + bli_herk_int + ( + &BLIS_ONE, + a1, + b, + &BLIS_ONE, + c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + } } diff --git a/frame/3/herk/old/bli_herk_blk_var2.c b/frame/3/herk/old/bli_herk_blk_var2.c new file mode 100644 index 000000000..739ae0341 --- /dev/null +++ b/frame/3/herk/old/bli_herk_blk_var2.c @@ -0,0 +1,98 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_herk_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_herk_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_herk_prune_unref_mparts_n( a, ah, c ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_ndim + ( + direct, thread, b, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); + + // Partition along the n dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform herk subproblem. 
+ bli_herk_int + ( + &BLIS_ONE, + a, + b1, + &BLIS_ONE, + c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + } +} + diff --git a/frame/3/herk/old/bli_herk_blk_var3.c b/frame/3/herk/old/bli_herk_blk_var3.c new file mode 100644 index 000000000..949ab53da --- /dev/null +++ b/frame/3/herk/old/bli_herk_blk_var3.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_herk_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_herk_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_herk_prune_unref_mparts_k( a, b, c ); + + // Query dimension in partitioning direction. + k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + // Notice that, unlike with gemm/hemm/symm/trmm/trsm, we do not need + // to call a kc-specific routine. We do not need kc to be a multiple + // of MR or NR since neither A nor B has structure in herk. + b_alg = bli_determine_blocksize( direct, i, k_trans, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Perform herk subproblem. + bli_herk_int + ( + &BLIS_ONE, + a1, + b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + + // This variant executes multiple rank-k updates. 
Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c is an aliased obj_t (see _int() function), we can + // simply overwrite the internal beta scalar with BLIS_ONE once it + // has been used in the first iteration. + if ( i == 0 && bli_thread_am_ichief( thread ) ) + bli_obj_scalar_reset( c ); + } +} + diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/old/bli_herk_int.c similarity index 91% rename from frame/3/herk/bli_herk_int.c rename to frame/3/herk/old/bli_herk_int.c index 409b693a5..b7d58940b 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/old/bli_herk_int.c @@ -34,7 +34,7 @@ #include "blis.h" -#if 1 +#if 0 static gemm_voft vars[4][3] = { // unblocked optimized unblocked blocked @@ -53,7 +53,7 @@ void bli_herk_int obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -78,9 +78,9 @@ void bli_herk_int if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *ah ) ) { - if( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_obarrier( thread ); + if ( bli_thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + bli_thread_obarrier( thread ); return; } @@ -96,24 +96,26 @@ void bli_herk_int // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. +#if 0 if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } +#endif // If alpha is non-unit, typecast and apply it to the scalar // attached to A'. 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &ah_local ); + bli_obj_scalar_apply_scalar( alpha, &ah_local ); } // If beta is non-unit, typecast and apply it to the scalar // attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } #if 0 @@ -122,15 +124,17 @@ void bli_herk_int else uplo = 1; #endif +#if 0 // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; +#endif // Extract the function pointer from the current control tree node. - //f = bli_cntl_sub_prob( cntl ); + f = bli_cntl_var_func( cntl ); // Invoke the variant. f @@ -140,7 +144,7 @@ void bli_herk_int &c_local, cntx, cntl, - thread + thread ); } diff --git a/frame/3/herk/bli_herk_int.h b/frame/3/herk/old/bli_herk_int.h similarity index 98% rename from frame/3/herk/bli_herk_int.h rename to frame/3/herk/old/bli_herk_int.h index c762b9372..1e649b968 100644 --- a/frame/3/herk/bli_herk_int.h +++ b/frame/3/herk/old/bli_herk_int.h @@ -40,6 +40,6 @@ void bli_herk_int obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 342f04512..b864ce06a 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -43,7 +43,7 @@ void bli_symm_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ) { obj_t a_local; @@ -63,7 +63,7 @@ void bli_symm_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. 
bli_obj_alias_to( *a, a_local ); @@ -74,7 +74,7 @@ void bli_symm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( b_local ); @@ -88,22 +88,27 @@ void bli_symm_front bli_obj_swap( a_local, b_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_GEMM, cntx ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h index b1ee691f2..6ba9a5aeb 100644 --- a/frame/3/symm/bli_symm_front.h +++ b/frame/3/symm/bli_symm_front.h @@ -41,5 +41,5 @@ void bli_symm_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ); diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 7d73dd17d..936c43635 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -42,7 +42,7 @@ void bli_syr2k_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ) { obj_t c_local; @@ -64,7 +64,7 @@ void bli_syr2k_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -83,52 +83,47 @@ void bli_syr2k_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( c_local ); } -#if 0 - // Invoke the internal back-end. - bli_her2k_int( alpha, - &a_local, - &bt_local, - alpha, - &b_local, - &at_local, - beta, - &c_local, - cntl ); -#else + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); + // Invoke herk twice, using beta only the first time. 
- thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &bt_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &bt_local, + beta, + &c_local, + cntx, + cntl, + infos + ); - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &b_local, - &at_local, - &BLIS_ONE, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); -#endif + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &b_local, + &at_local, + &BLIS_ONE, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syr2k/bli_syr2k_front.h b/frame/3/syr2k/bli_syr2k_front.h index 502fb033b..8d227c125 100644 --- a/frame/3/syr2k/bli_syr2k_front.h +++ b/frame/3/syr2k/bli_syr2k_front.h @@ -40,5 +40,5 @@ void bli_syr2k_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ); diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index e3c62245f..8b379ab0e 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -41,7 +41,7 @@ void bli_syrk_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ) { obj_t a_local; @@ -61,7 +61,7 @@ void bli_syrk_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and C in case we need to apply transformations. 
bli_obj_alias_to( *a, a_local ); @@ -76,27 +76,32 @@ void bli_syrk_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( c_local ); } - - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &at_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &at_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h index 700b8e263..73f58baef 100644 --- a/frame/3/syrk/bli_syrk_front.h +++ b/frame/3/syrk/bli_syrk_front.h @@ -39,5 +39,5 @@ void bli_syrk_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ); diff --git a/frame/3/trmm/bli_trmm.h b/frame/3/trmm/bli_trmm.h index 056fedb50..4eeec84e0 100644 --- a/frame/3/trmm/bli_trmm.h +++ b/frame/3/trmm/bli_trmm.h @@ -33,7 +33,6 @@ */ #include "bli_trmm_front.h" -#include "bli_trmm_int.h" #include "bli_trmm_var.h" diff --git a/frame/3/trmm/bli_trmm_blk_var1.c b/frame/3/trmm/bli_trmm_blk_var1.c deleted file mode 100644 index 3b9eae428..000000000 --- a/frame/3/trmm/bli_trmm_blk_var1.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_trmm_blk_var1 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread - ) -{ - obj_t b_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack = NULL; - obj_t* b_pack = NULL; - obj_t* c1_pack = NULL; - - dir_t direct; - - dim_t i; - dim_t b_alg; - - // Determine the direction in which to partition (forwards or backwards). - direct = bli_trmm_direct( a, b, c ); - - // Prune any zero region that exists along the partitioning dimension. - bli_trmm_prune_unref_mparts_m( a, b, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing B. - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); - - // Initialize all pack objects that are passed into packm_init(). 
- if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack B (if instructed). - bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_weighted_mdim( direct, thread, a, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the m dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. - bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform trmm subproblem. - bli_trmm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/trmm/bli_trmm_blk_var2.c b/frame/3/trmm/bli_trmm_blk_var2.c deleted file mode 100644 index cf53b8e28..000000000 --- a/frame/3/trmm/bli_trmm_blk_var2.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_trmm_blk_var2 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread - ) -{ - obj_t a_pack_s; - obj_t b1_pack_s, c1_pack_s; - - obj_t b1, c1; - obj_t* a_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c1_pack = NULL; - - dir_t direct; - - dim_t i; - dim_t b_alg; - - // Determine the direction in which to partition (forwards or backwards). - direct = bli_trmm_direct( a, b, c ); - - // Prune any zero region that exists along the partitioning dimension. - bli_trmm_prune_unref_mparts_n( a, b, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A - bli_obj_init_pack( &a_pack_s ); - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for B and C that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &b1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). 
- bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_weighted_ndim( direct, thread, b, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the n dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for B1 and C1. - bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); - bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and B1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform trmm subproblem. - bli_trmm_int( &BLIS_ONE, - a_pack, - b1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/trmm/bli_trmm_blk_var3.c b/frame/3/trmm/bli_trmm_blk_var3.c deleted file mode 100644 index f6a425b07..000000000 --- a/frame/3/trmm/bli_trmm_blk_var3.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_trmm_blk_var3 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread - ) -{ - obj_t c_pack_s; - obj_t a1_pack_s, b1_pack_s; - - obj_t a1, b1; - obj_t* a1_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c_pack = NULL; - - dir_t direct; - - dim_t i; - dim_t b_alg; - dim_t k_trans; - - // Determine the direction in which to partition (forwards or backwards). - direct = bli_trmm_direct( a, b, c ); - - // Prune any zero region that exists along the partitioning dimension. - bli_trmm_prune_unref_mparts_k( a, b, c ); - - if( bli_thread_am_ochief( thread ) ){ - // Initialize object for packing C - bli_obj_init_pack( &c_pack_s ); - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - // Initialize pack objects for A and B that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ){ - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &b1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - - // Pack C (if instructed). - bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // Query dimension in partitioning direction. 
- k_trans = bli_obj_width_after_trans( *a ); - - // Partition along the k dimension. - for ( i = 0; i < k_trans; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: We call a trmm-specific function to determine the kc - // blocksize so that we can implement the "nudging" of kc to be - // a multiple of mr or nr, as needed. - b_alg = bli_trmm_determine_kc( direct, i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and B1. - bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); - - // Initialize objects for packing A1 and B1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform trmm subproblem. - bli_trmm_int( &BLIS_ONE, - a1_pack, - b1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - bli_thread_ibarrier( thread ); - } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- if( bli_thread_am_ochief( thread ) ){ - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - } - if( bli_thread_am_ichief( thread ) ){ - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - } -} - diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 55f58974b..689acbb72 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -41,7 +41,7 @@ void bli_trmm_front obj_t* a, obj_t* b, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ) { obj_t a_local; @@ -61,7 +61,7 @@ void bli_trmm_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); @@ -107,7 +107,7 @@ void bli_trmm_front // NOTE: We disable the optimization for 1x1 matrices since the concept // of row- vs. column storage breaks down. if ( !bli_obj_is_1x1( c_local ) ) - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); @@ -130,24 +130,28 @@ void bli_trmm_front bli_obj_set_as_root( a_local ); bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_TRMM, cntx ); - // Invoke the internal back-end. 
- bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_trmm_int, - alpha, - &a_local, - &b_local, - &BLIS_ZERO, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM, side ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + &BLIS_ZERO, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h index d47c40ce9..7a263fdb1 100644 --- a/frame/3/trmm/bli_trmm_front.h +++ b/frame/3/trmm/bli_trmm_front.h @@ -39,5 +39,5 @@ void bli_trmm_front obj_t* a, obj_t* b, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 61843d4c2..cc729834b 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -61,7 +61,7 @@ void bli_trmm_ll_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -311,7 +311,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 356ea1a37..eacf91795 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -61,7 +61,7 @@ void bli_trmm_lu_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -318,7 +318,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread 
); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 581cfdf8d..f8b09a3f5 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -61,7 +61,7 @@ void bli_trmm_rl_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -318,7 +318,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 8033c42c2..3fb94c9d6 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -61,7 +61,7 @@ void bli_trmm_ru_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -319,7 +319,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index 12bfa0b9f..d3ac2fa34 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -46,13 +46,13 @@ void PASTEMAC0(opname) \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ - gemm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( trmm_blk_var1 ) -GENPROT( trmm_blk_var2 ) -GENPROT( trmm_blk_var3 ) +//GENPROT( trmm_blk_var1 ) +//GENPROT( trmm_blk_var2 ) +//GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) 
GENPROT( trmm_ll_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index 5b0a89659..cbec35678 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -46,7 +46,7 @@ void bli_trmm_xx_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { diff --git a/frame/3/trmm/old/bli_trmm_blk_var1.c b/frame/3/trmm/old/bli_trmm_blk_var1.c new file mode 100644 index 000000000..9f2e91d07 --- /dev/null +++ b/frame/3/trmm/old/bli_trmm_blk_var1.c @@ -0,0 +1,98 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_m( a, b, c ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_mdim + ( + direct, thread, a, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); + + // Partition along the m dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and C1. + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trmm subproblem on A1 and C1 (partitions are passed by address).
+ bli_trmm_int + ( + &BLIS_ONE, + &a1, + b, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + } +} + diff --git a/frame/3/trmm/old/bli_trmm_blk_var2.c b/frame/3/trmm/old/bli_trmm_blk_var2.c new file mode 100644 index 000000000..df5f58614 --- /dev/null +++ b/frame/3/trmm/old/bli_trmm_blk_var2.c @@ -0,0 +1,98 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_n( a, b, c ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_ndim + ( + direct, thread, b, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); + + // Partition along the n dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trmm subproblem on B1 and C1 (partitions are passed by address).
+ bli_trmm_int + ( + &BLIS_ONE, + a, + &b1, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + } +} + diff --git a/frame/3/trmm/old/bli_trmm_blk_var3.c b/frame/3/trmm/old/bli_trmm_blk_var3.c new file mode 100644 index 000000000..2957cf153 --- /dev/null +++ b/frame/3/trmm/old/bli_trmm_blk_var3.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_k( a, b, c ); + + // Query dimension in partitioning direction. + k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_trmm_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Perform trmm subproblem on A1 and B1 (partitions are passed by address). + bli_trmm_int + ( + &BLIS_ONE, + &a1, + &b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + + // Unlike variant 3 of gemm and herk, which reset the internal scalar + // on C at the end of the first iteration so that subsequent iterations + // do not erroneously apply beta more than once, it is important that + // this behavior not be applied to trmm.
That is because the order of + // computation is always such that beta must be zero, since the macro- + // kernel only applies beta to the row-panel of C that corresponds to + // the current block intersecting the diagonal. It turns out that this + // same pattern works for trmm3 as well--by only applying beta to + // the current row-panel of C, beta is applied to all of C exactly + // once. Thus, for neither trmm nor trmm3 should we reset the scalar + // on C after the first iteration. + } +} + diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/old/bli_trmm_int.c similarity index 98% rename from frame/3/trmm/bli_trmm_int.c rename to frame/3/trmm/old/bli_trmm_int.c index d6f4ca4be..830a22d1f 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/old/bli_trmm_int.c @@ -34,7 +34,7 @@ #include "blis.h" -#if 1 +#if 0 static gemm_voft vars[4][3] = { // unblocked optimized unblocked blocked @@ -53,7 +53,7 @@ void bli_trmm_int obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -96,11 +96,13 @@ void bli_trmm_int // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. +#if 0 if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { bli_obj_induce_trans( c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } +#endif // If alpha is non-unit, typecast and apply it to the scalar attached // to B. @@ -134,15 +136,17 @@ void bli_trmm_int } #endif +#if 0 // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; +#endif // Extract the function pointer from the current control tree node. - //f = bli_cntl_sub_prob( cntl ); + f = bli_cntl_var_func( cntl ); // Invoke the variant.
f diff --git a/frame/3/trmm/bli_trmm_int.h b/frame/3/trmm/old/bli_trmm_int.h similarity index 98% rename from frame/3/trmm/bli_trmm_int.h rename to frame/3/trmm/old/bli_trmm_int.h index 29c578324..697fc06b5 100644 --- a/frame/3/trmm/bli_trmm_int.h +++ b/frame/3/trmm/old/bli_trmm_int.h @@ -40,7 +40,7 @@ void bli_trmm_int obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index eb816d8fc..e9e9261f0 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -43,7 +43,7 @@ void bli_trmm3_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ) { obj_t a_local; @@ -63,7 +63,7 @@ void bli_trmm3_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); @@ -106,7 +106,7 @@ void bli_trmm3_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); @@ -130,22 +130,27 @@ void bli_trmm3_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM3, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_TRMM, cntx ); - // Invoke the internal back-end. 
- bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_trmm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM3, side ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h index 8f4feaba1..ed158c0b8 100644 --- a/frame/3/trmm3/bli_trmm3_front.h +++ b/frame/3/trmm3/bli_trmm3_front.h @@ -41,5 +41,5 @@ void bli_trmm3_front obj_t* beta, obj_t* c, cntx_t* cntx, - gemm_t* cntl + cntl_t* cntl ); diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 7d479a90a..1634efa0c 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -40,55 +40,32 @@ void bli_trsm_blk_var1 obj_t* b, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { - obj_t b_pack_s; - obj_t a1_pack_s; - obj_t a1, c1; - obj_t* b_pack = NULL; - obj_t* a1_pack = NULL; dir_t direct; dim_t i; dim_t b_alg; + dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_trsm_direct( a, b, c ); + direct = bli_l3_direct( a, b, c, cntx ); // Prune any zero region that exists along the partitioning dimension. - bli_trsm_prune_unref_mparts_m( a, b, c ); + bli_l3_prune_unref_mparts_m( a, b, c, cntx ); - // Initialize object for packing B. - if( bli_thread_am_ochief( thread ) ) { - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); + // Determine the current thread's subpartition range. 
+ bli_thread_get_range_mdim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); - // Initialize object for packing B. - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - - // Pack B1 (if instructed). - bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_mdim( direct, thread, a, - ( bli_obj_root_is_triangular( *a ) ? - bli_cntx_get_bmult( BLIS_MR, cntx ) : - bli_cntx_get_bmult( BLIS_NR, cntx ) ), - &my_start, &my_end ); - - // Partition along the remaining portion of the m dimension. + // Partition along the m dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. @@ -101,36 +78,20 @@ void bli_trsm_blk_var1 bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, c, &c1 ); - // Initialize object for packing A1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - // Perform trsm subproblem. - bli_trsm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - &c1, - cntx, - bli_cntl_sub_trsm( cntl ), - bli_thrinfo_sub_self( thread ) ); - bli_thread_ibarrier( thread ); - } + bli_trsm_int + ( + &BLIS_ONE, + &a1, + b, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ) - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); + bli_thread_ibarrier( thread ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index f9bd6d135..c2ca6b3ed 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -40,63 +40,30 @@ void bli_trsm_blk_var2 obj_t* b, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { - obj_t a_pack_s; - obj_t b1_pack_s, c1_pack_s; - - obj_t b1, c1; - obj_t* a_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c1_pack = NULL; + obj_t b1, c1; dir_t direct; dim_t i; dim_t b_alg; + dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_trsm_direct( a, b, c ); + direct = bli_l3_direct( a, b, c, cntx ); // Prune any zero region that exists along the partitioning dimension. - bli_trsm_prune_unref_mparts_n( a, b, c ); + bli_l3_prune_unref_mparts_n( a, b, c, cntx ); - // Initialize pack objects for A that are passed into packm_init(). - if( bli_thread_am_ochief( thread ) ) { - bli_obj_init_pack( &a_pack_s ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for B and C that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &b1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). 
- bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_ndim( direct, thread, b, - ( bli_obj_root_is_triangular( *b ) ? - bli_cntx_get_bmult( BLIS_MR, cntx ) : - bli_cntx_get_bmult( BLIS_NR, cntx ) ), - &my_start, &my_end ); + // Determine the current thread's subpartition range. + bli_thread_get_range_ndim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); // Partition along the n dimension. for ( i = my_start; i < my_end; i += b_alg ) @@ -111,50 +78,20 @@ void bli_trsm_blk_var2 bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, i, b_alg, c, &c1 ); - // Initialize objects for packing A1 and B1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - // Perform trsm subproblem. - bli_trsm_int( &BLIS_ONE, - a_pack, - b1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_trsm( cntl ), - bli_thrinfo_sub_self( thread ) ); - bli_thread_ibarrier( thread ); + bli_trsm_int + ( + &BLIS_ONE, + a, + &b1, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); + bli_thread_ibarrier( thread ); } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } } diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 130d2281c..9d726389f 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -40,56 +40,23 @@ void bli_trsm_blk_var3 obj_t* b, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { - obj_t c_pack_s; - obj_t a1_pack_s, b1_pack_s; - - obj_t a1, b1; - obj_t* a1_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c_pack = NULL; + obj_t a1, b1; dir_t direct; - dim_t i; - dim_t b_alg; - dim_t k_trans; + dim_t i; + dim_t b_alg; + dim_t k_trans; // Determine the direction in which to partition (forwards or backwards). - direct = bli_trsm_direct( a, b, c ); + direct = bli_l3_direct( a, b, c, cntx ); // Prune any zero region that exists along the partitioning dimension. - bli_trsm_prune_unref_mparts_k( a, b, c ); - - // Initialize pack objects for C that are passed into packm_init(). - if( bli_thread_am_ochief( thread ) ) { - bli_obj_init_pack( &c_pack_s ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &b1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - - // Pack C (if instructed). 
- bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); + bli_l3_prune_unref_mparts_k( a, b, c, cntx ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); @@ -98,9 +65,6 @@ void bli_trsm_blk_var3 for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - // NOTE: We call a trsm-specific function to determine the kc - // blocksize so that we can implement the "nudging" of kc to be - // a multiple of mr, as needed. b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b, bli_cntl_bszid( cntl ), cntx ); @@ -110,61 +74,29 @@ void bli_trsm_blk_var3 bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, b, &b1 ); - // Initialize objects for packing A1 and B1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - // Perform trsm subproblem. - bli_trsm_int( &BLIS_ONE, - a1_pack, - b1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_trsm( cntl ), - bli_thrinfo_sub_self( thread ) ); + bli_trsm_int + ( + &BLIS_ONE, + &a1, + &b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + bli_thread_ibarrier( thread ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. 
- bli_thread_ibarrier( thread ); - if ( i == 0 && bli_thread_am_ichief( thread ) ) { - bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( c_pack ); - } + if ( i == 0 ) + { + bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( c ); + } } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - if( bli_thread_am_ochief( thread ) ) { - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - } - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - } } diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 3a83faafd..b4f7422ba 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -34,235 +34,177 @@ #include "blis.h" -extern scalm_t* scalm_cntl; - -extern gemm_t* gemm_cntl_bp_ke; - -packm_t* trsm_l_packa_cntl = NULL; -packm_t* trsm_l_packb_cntl = NULL; - -packm_t* trsm_r_packa_cntl = NULL; -packm_t* trsm_r_packb_cntl = NULL; - -trsm_t* trsm_cntl_bp_ke = NULL; - -trsm_t* trsm_l_cntl_op_bp = NULL; -trsm_t* trsm_l_cntl_mm_op = NULL; -trsm_t* trsm_l_cntl_vl_mm = NULL; - -trsm_t* trsm_r_cntl_op_bp = NULL; -trsm_t* trsm_r_cntl_mm_op = NULL; -trsm_t* trsm_r_cntl_vl_mm = NULL; - -trsm_t* trsm_l_cntl = NULL; -trsm_t* trsm_r_cntl = NULL; - - -void bli_trsm_cntl_init() +cntl_t* bli_trsm_cntl_create + ( + side_t side + ) { - - // Create control tree objects for packm operations (left side). 
- trsm_l_packa_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - // IMPORTANT: n dim multiple must be mr to - // support right and bottom-right edge cases - BLIS_MR, - BLIS_MR, - TRUE, // invert diagonal - TRUE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK ); - - trsm_l_packb_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - // IMPORTANT: m dim multiple must be mr since - // B_pack is updated (ie: serves as C) in trsm - BLIS_MR, - BLIS_NR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL ); - - // Create control tree objects for packm operations (right side). - trsm_r_packa_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_NR, - BLIS_MR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK ); - - trsm_r_packb_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, // pack panels of B compactly - BLIS_MR, - BLIS_MR, - TRUE, // invert diagonal - FALSE, // reverse iteration if upper? - TRUE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL ); - - - // Create control tree object for lowest-level block-panel kernel. - trsm_cntl_bp_ke - = - bli_trsm_cntl_obj_create( BLIS_UNB_OPT, - BLIS_VARIANT2, - 0, // bszid_t not used by macro-kernel - NULL, NULL, NULL, NULL, - NULL, NULL, NULL ); - - // Create control tree object for outer panel (to block-panel) - // problem (left side). 
- trsm_l_cntl_op_bp - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MC, - NULL, - trsm_l_packa_cntl, - trsm_l_packb_cntl, - NULL, - trsm_cntl_bp_ke, - gemm_cntl_bp_ke, - NULL ); - - // Create control tree object for general problem via multiple - // rank-k (outer panel) updates (left side). - trsm_l_cntl_mm_op - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, - BLIS_KC, - NULL, - NULL, - NULL, - NULL, - trsm_l_cntl_op_bp, - NULL, - NULL ); - - // Create control tree object for very large problem via multiple - // general problems (left side). - trsm_l_cntl_vl_mm - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, - BLIS_NC, - NULL, - NULL, - NULL, - NULL, - trsm_l_cntl_mm_op, - NULL, - NULL ); - - // Create control tree object for outer panel (to block-panel) - // problem (right side). - trsm_r_cntl_op_bp - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MC, - NULL, - trsm_r_packa_cntl, - trsm_r_packb_cntl, - NULL, - trsm_cntl_bp_ke, - gemm_cntl_bp_ke, - NULL ); - - // Create control tree object for general problem via multiple - // rank-k (outer panel) updates (right side). - trsm_r_cntl_mm_op - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, - BLIS_KC, - NULL, - NULL, - NULL, - NULL, - trsm_r_cntl_op_bp, - NULL, - NULL ); - - // Create control tree object for very large problem via multiple - // general problems (right side). - trsm_r_cntl_vl_mm - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, - BLIS_NC, - NULL, - NULL, - NULL, - NULL, - trsm_r_cntl_mm_op, - NULL, - NULL ); - - // Alias the "master" trsm control trees to shorter names. 
- trsm_l_cntl = trsm_l_cntl_vl_mm; - trsm_r_cntl = trsm_r_cntl_vl_mm; + if ( bli_is_left( side ) ) return bli_trsm_l_cntl_create(); + else return bli_trsm_r_cntl_create(); } -void bli_trsm_cntl_finalize() +cntl_t* bli_trsm_l_cntl_create + ( + void + ) { - bli_cntl_obj_free( trsm_l_packa_cntl ); - bli_cntl_obj_free( trsm_l_packb_cntl ); - bli_cntl_obj_free( trsm_r_packa_cntl ); - bli_cntl_obj_free( trsm_r_packb_cntl ); + void* macro_kernel_p = bli_trsm_xx_ker_var2; - bli_cntl_obj_free( trsm_cntl_bp_ke ); + // Create a node for the macro-kernel. + cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create + ( + BLIS_NR, // bszid not used by macro-kernel. + macro_kernel_p, + NULL // no sub-node; this is the leaf of the tree. + ); - bli_cntl_obj_free( trsm_l_cntl_op_bp ); - bli_cntl_obj_free( trsm_l_cntl_mm_op ); - bli_cntl_obj_free( trsm_l_cntl_vl_mm ); - bli_cntl_obj_free( trsm_r_cntl_op_bp ); - bli_cntl_obj_free( trsm_r_cntl_mm_op ); - bli_cntl_obj_free( trsm_r_cntl_vl_mm ); + // Create a node for packing matrix A. + cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_trsm_packa, + bli_packm_blk_var1, + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + TRUE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + trsm_cntl_bp_ke + ); + + // Create a node for partitioning the m dimension by MC. + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + ( + BLIS_MC, + bli_trsm_blk_var1, + trsm_cntl_packa + ); + + // Create a node for packing matrix B. + cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_trsm_packb, + bli_packm_blk_var1, + BLIS_MR, + BLIS_NR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + trsm_cntl_op_bp + ); + + // Create a node for partitioning the k dimension by KC.
+ cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + ( + BLIS_KC, + bli_trsm_blk_var3, + trsm_cntl_packb + ); + + // Create a node for partitioning the n dimension by NC. + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + ( + BLIS_NC, + bli_trsm_blk_var2, + trsm_cntl_mm_op + ); + + return trsm_cntl_vl_mm; } -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_packm_a, - packm_t* sub_packm_b, - packm_t* sub_packm_c, - trsm_t* sub_trsm, - gemm_t* sub_gemm, - unpackm_t* sub_unpackm_c ) +cntl_t* bli_trsm_r_cntl_create + ( + void + ) { - trsm_t* cntl; + void* macro_kernel_p = bli_trsm_xx_ker_var2; - cntl = ( trsm_t* ) bli_malloc_intl( sizeof(trsm_t) ); + // Create a node for the macro-kernel. + cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create + ( + BLIS_NR, // bszid not used by macro-kernel. + macro_kernel_p, + NULL // no sub-node; this is the leaf of the tree. + ); - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bszid = bszid; - cntl->sub_scalm = sub_scalm; - cntl->sub_packm_a = sub_packm_a; - cntl->sub_packm_b = sub_packm_b; - cntl->sub_packm_c = sub_packm_c; - cntl->sub_trsm = sub_trsm; - cntl->sub_gemm = sub_gemm; - cntl->sub_unpackm_c = sub_unpackm_c; + // Create a node for packing matrix A. + cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_trsm_packa, + bli_packm_blk_var1, + BLIS_NR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + trsm_cntl_bp_ke + ); - return cntl; + // Create a node for partitioning the m dimension by MC. + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + ( + BLIS_MC, + bli_trsm_blk_var1, + trsm_cntl_packa + ); + + // Create a node for packing matrix B. 
+ cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_trsm_packb, + bli_packm_blk_var1, + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + FALSE, // reverse iteration if upper? + TRUE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + trsm_cntl_op_bp + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + ( + BLIS_KC, + bli_trsm_blk_var3, + trsm_cntl_packb + ); + + // Create a node for partitioning the n dimension by NC. + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + ( + BLIS_NC, + bli_trsm_blk_var2, + trsm_cntl_mm_op + ); + + return trsm_cntl_vl_mm; +} + +void bli_trsm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ) +{ + bli_cntl_free( cntl, thread ); +} + +// ----------------------------------------------------------------------------- + +cntl_t* bli_trsm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ) +{ + return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 651cc8599..6dbe9adce 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -32,33 +32,33 @@ */ -struct trsm_s -{ - impl_t impl_type; - varnum_t var_num; - bszid_t bszid; - struct scalm_s* sub_scalm; - struct packm_s* sub_packm_a; - struct packm_s* sub_packm_b; - struct packm_s* sub_packm_c; - struct trsm_s* sub_trsm; - struct gemm_s* sub_gemm; - struct unpackm_s* sub_unpackm_c; -}; -typedef struct trsm_s trsm_t; +cntl_t* bli_trsm_cntl_create + ( + side_t side + ); -#define bli_cntl_sub_trsm( cntl ) cntl->sub_trsm +cntl_t* bli_trsm_l_cntl_create + ( + void + ); -void bli_trsm_cntl_init( void ); -void bli_trsm_cntl_finalize( void ); -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_pack_a, - packm_t* sub_pack_b, - packm_t* sub_pack_c, - trsm_t* sub_trsm, -
gemm_t* sub_gemm, - unpackm_t* sub_unpack_c ); +cntl_t* bli_trsm_r_cntl_create + ( + void + ); + +void bli_trsm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +// ----------------------------------------------------------------------------- + +cntl_t* bli_trsm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ); diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 0e6e5a2c2..3466d2d18 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -41,11 +41,9 @@ void bli_trsm_front obj_t* a, obj_t* b, cntx_t* cntx, - trsm_t* l_cntl, - trsm_t* r_cntl + cntl_t* cntl ) { - trsm_t* cntl; obj_t a_local; obj_t b_local; obj_t c_local; @@ -63,7 +61,7 @@ void bli_trsm_front // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); @@ -118,26 +116,27 @@ void bli_trsm_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Choose the control tree. - if ( bli_is_left( side ) ) cntl = l_cntl; - else cntl = r_cntl; + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_TRSM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRSM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_trsm_int, - alpha, - &a_local, - &b_local, - alpha, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRSM, side ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + n_threads, + bli_trsm_int, + alpha, + &a_local, + &b_local, + alpha, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index c80156d72..84feef22f 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -39,6 +39,5 @@ void bli_trsm_front obj_t* a, obj_t* b, cntx_t* cntx, - trsm_t* l_cntl, - trsm_t* r_cntl + cntl_t* cntl ); diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index a517a6cc3..e6614cb3f 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -34,17 +34,6 @@ #include "blis.h" -#if 1 -static trsm_voft vars[4][3] = -{ - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1 }, - { NULL, bli_trsm_xx_ker_var2, bli_trsm_blk_var2 }, - { NULL, NULL, bli_trsm_blk_var3 }, - { NULL, NULL, NULL }, -}; -#endif - void bli_trsm_int ( obj_t* alpha, @@ -53,18 +42,13 @@ void bli_trsm_int obj_t* beta, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { obj_t a_local; obj_t b_local; obj_t c_local; -#if 0 - bool_t side, uplo; -#endif - varnum_t n; - impl_t i; trsm_voft f; // Check parameters. @@ -78,9 +62,9 @@ void bli_trsm_int if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - if( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_obarrier( thread ); + if ( bli_thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + bli_thread_obarrier( thread ); return; } @@ -98,14 +82,14 @@ void bli_trsm_int // packed, this is our last chance to handle the transposition. if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } // If beta is non-unit, apply it to the scalar attached to C. 
if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -113,28 +97,15 @@ void bli_trsm_int // matrix's root object (whether that is matrix A or matrix B). if ( bli_obj_root_is_triangular( *a ) ) { -#if 0 - side = 0; - if ( bli_obj_root_is_lower( *a ) ) uplo = 0; - else uplo = 1; -#endif - // If alpha is non-unit, typecast and apply it to the scalar // attached to B (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + bli_obj_scalar_apply_scalar( alpha, &b_local ); } } else // if ( bli_obj_root_is_triangular( *b ) ) { -#if 0 - side = 1; - // Set a bool based on the uplo field of A's root object. - if ( bli_obj_root_is_lower( *b ) ) uplo = 0; - else uplo = 1; -#endif - // If alpha is non-unit, typecast and apply it to the scalar // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) @@ -143,17 +114,11 @@ void bli_trsm_int } } - bli_thread_obarrier( thread ); - - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. - f = vars[n][i]; + // FGVZ->TMS: Is this barrier still needed? + bli_thread_obarrier( thread ); // Extract the function pointer from the current control tree node. - //f = bli_cntl_sub_prob( cntl ); + f = bli_cntl_var_func( cntl ); // Invoke the variant. 
f @@ -163,7 +128,7 @@ void bli_trsm_int &c_local, cntx, cntl, - thread + thread ); } diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/trsm/bli_trsm_int.h index a379ea002..a147a3298 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/trsm/bli_trsm_int.h @@ -40,7 +40,7 @@ void bli_trsm_int obj_t* beta, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 07ba4361f..b7d695318 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -61,7 +61,7 @@ void bli_trsm_ll_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -96,10 +96,11 @@ void bli_trsm_ll_ker_var2 FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to B. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of B prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
buf_alpha1 = bli_obj_internal_scalar_buffer( *b ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index ba34f1c3a..763592644 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -61,7 +61,7 @@ void bli_trsm_lu_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -96,10 +96,11 @@ void bli_trsm_lu_ker_var2 FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to B. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of B prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *b ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/gemm/ind/bli_gemm_ker_var4.h b/frame/3/trsm/bli_trsm_packab.c similarity index 63% rename from frame/3/gemm/ind/bli_gemm_ker_var4.h rename to frame/3/trsm/bli_trsm_packab.c index ad72fdd67..3a32ce097 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var4.h +++ b/frame/3/trsm/bli_trsm_packab.c @@ -32,44 +32,79 @@ */ +#include "blis.h" -// -// Prototype object-based interface. -// -void bli_gemm_ker_var4 +void bli_trsm_packa ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, - gemm_t* cntl, + cntl_t* cntl, thrinfo_t* thread - ); + ) +{ + obj_t a_pack; + // Pack matrix A according to the control tree node. 
+ bli_l3_packm + ( + a, + &a_pack, + cntx, + cntl, + thread + ); -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); + // Proceed with execution using packed matrix A. + bli_trsm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} -INSERT_GENTPROT_BASIC( gemm_ker_var4 ) +// ----------------------------------------------------------------------------- + +void bli_trsm_packb + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b_pack; + + // Pack matrix B according to the control tree node. + bli_l3_packm + ( + b, + &b_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix B. + bli_trsm_int + ( + &BLIS_ONE, + a, + &b_pack, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 7a25b1ce5..a18e88939 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -61,7 +61,7 @@ void bli_trsm_rl_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -96,10 +96,11 @@ void bli_trsm_rl_ker_var2 FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to A. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of A prior to it being updated by the trsm subproblem). 
This - // scalar may be unit, if for example it was applied during packing. + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index d610925f3..f5dad161b 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -61,7 +61,7 @@ void bli_trsm_ru_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { @@ -96,10 +96,11 @@ void bli_trsm_ru_ker_var2 FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to A. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of A prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 9c526d820..2ff45fa13 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -46,13 +46,15 @@ void PASTEMAC0(opname) \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ - trsm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) +GENPROT( trsm_packa ) +GENPROT( trsm_packb ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index ad1238319..8409432ca 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -46,7 +46,7 @@ void bli_trsm_xx_ker_var2 obj_t* b, obj_t* c, cntx_t* cntx, - trsm_t* cntl, + cntl_t* cntl, thrinfo_t* thread ) { diff --git a/frame/3/trsm/old/bli_trsm_cntl.c b/frame/3/trsm/old/bli_trsm_cntl.c new file mode 100644 index 000000000..3a83faafd --- /dev/null +++ b/frame/3/trsm/old/bli_trsm_cntl.c @@ -0,0 +1,268 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +extern gemm_t* gemm_cntl_bp_ke; + +packm_t* trsm_l_packa_cntl = NULL; +packm_t* trsm_l_packb_cntl = NULL; + +packm_t* trsm_r_packa_cntl = NULL; +packm_t* trsm_r_packb_cntl = NULL; + +trsm_t* trsm_cntl_bp_ke = NULL; + +trsm_t* trsm_l_cntl_op_bp = NULL; +trsm_t* trsm_l_cntl_mm_op = NULL; +trsm_t* trsm_l_cntl_vl_mm = NULL; + +trsm_t* trsm_r_cntl_op_bp = NULL; +trsm_t* trsm_r_cntl_mm_op = NULL; +trsm_t* trsm_r_cntl_vl_mm = NULL; + +trsm_t* trsm_l_cntl = NULL; +trsm_t* trsm_r_cntl = NULL; + + +void bli_trsm_cntl_init() +{ + + // Create control tree objects for packm operations (left side). + trsm_l_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + // IMPORTANT: n dim multiple must be mr to + // support right and bottom-right edge cases + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + TRUE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? 
+ BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm_l_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + // IMPORTANT: m dim multiple must be mr since + // B_pack is updated (ie: serves as C) in trsm + BLIS_MR, + BLIS_NR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (right side). + trsm_r_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + BLIS_NR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm_r_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, // pack panels of B compactly + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + FALSE, // reverse iteration if upper? + TRUE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + trsm_cntl_bp_ke + = + bli_trsm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + 0, // bszid_t not used by macro-kernel + NULL, NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (left side). + trsm_l_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + BLIS_MC, + NULL, + trsm_l_packa_cntl, + trsm_l_packb_cntl, + NULL, + trsm_cntl_bp_ke, + gemm_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (left side). + trsm_l_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + BLIS_KC, + NULL, + NULL, + NULL, + NULL, + trsm_l_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (left side). 
+ trsm_l_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + BLIS_NC, + NULL, + NULL, + NULL, + NULL, + trsm_l_cntl_mm_op, + NULL, + NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (right side). + trsm_r_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + BLIS_MC, + NULL, + trsm_r_packa_cntl, + trsm_r_packb_cntl, + NULL, + trsm_cntl_bp_ke, + gemm_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (right side). + trsm_r_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + BLIS_KC, + NULL, + NULL, + NULL, + NULL, + trsm_r_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (right side). + trsm_r_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + BLIS_NC, + NULL, + NULL, + NULL, + NULL, + trsm_r_cntl_mm_op, + NULL, + NULL ); + + // Alias the "master" trsm control trees to shorter names. 
+ trsm_l_cntl = trsm_l_cntl_vl_mm; + trsm_r_cntl = trsm_r_cntl_vl_mm; +} + +void bli_trsm_cntl_finalize() +{ + bli_cntl_obj_free( trsm_l_packa_cntl ); + bli_cntl_obj_free( trsm_l_packb_cntl ); + bli_cntl_obj_free( trsm_r_packa_cntl ); + bli_cntl_obj_free( trsm_r_packb_cntl ); + + bli_cntl_obj_free( trsm_cntl_bp_ke ); + + bli_cntl_obj_free( trsm_l_cntl_op_bp ); + bli_cntl_obj_free( trsm_l_cntl_mm_op ); + bli_cntl_obj_free( trsm_l_cntl_vl_mm ); + bli_cntl_obj_free( trsm_r_cntl_op_bp ); + bli_cntl_obj_free( trsm_r_cntl_mm_op ); + bli_cntl_obj_free( trsm_r_cntl_vl_mm ); +} + +trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, + varnum_t var_num, + bszid_t bszid, + scalm_t* sub_scalm, + packm_t* sub_packm_a, + packm_t* sub_packm_b, + packm_t* sub_packm_c, + trsm_t* sub_trsm, + gemm_t* sub_gemm, + unpackm_t* sub_unpackm_c ) +{ + trsm_t* cntl; + + cntl = ( trsm_t* ) bli_malloc_intl( sizeof(trsm_t) ); + + cntl->impl_type = impl_type; + cntl->var_num = var_num; + cntl->bszid = bszid; + cntl->sub_scalm = sub_scalm; + cntl->sub_packm_a = sub_packm_a; + cntl->sub_packm_b = sub_packm_b; + cntl->sub_packm_c = sub_packm_c; + cntl->sub_trsm = sub_trsm; + cntl->sub_gemm = sub_gemm; + cntl->sub_unpackm_c = sub_unpackm_c; + + return cntl; +} + diff --git a/frame/1/packv/bli_packv_cntl.h b/frame/3/trsm/old/bli_trsm_cntl.h similarity index 61% rename from frame/1/packv/bli_packv_cntl.h rename to frame/3/trsm/old/bli_trsm_cntl.h index d4682f085..651cc8599 100644 --- a/frame/1/packv/bli_packv_cntl.h +++ b/frame/3/trsm/old/bli_trsm_cntl.h @@ -32,32 +32,33 @@ */ -struct packv_s +struct trsm_s { - impl_t impl_type; - varnum_t var_num; - bszid_t bmid; - pack_t pack_schema; + impl_t impl_type; + varnum_t var_num; + bszid_t bszid; + struct scalm_s* sub_scalm; + struct packm_s* sub_packm_a; + struct packm_s* sub_packm_b; + struct packm_s* sub_packm_c; + struct trsm_s* sub_trsm; + struct gemm_s* sub_gemm; + struct unpackm_s* sub_unpackm_c; }; -typedef struct packv_s packv_t; +typedef struct 
trsm_s trsm_t; -#define cntl_bmid( cntl ) cntl->bmid +#define bli_cntl_sub_trsm( cntl ) cntl->sub_trsm -#define bli_cntl_sub_packv( cntl ) cntl->sub_packv -#define bli_cntl_sub_packv_x( cntl ) cntl->sub_packv_x -#define bli_cntl_sub_packv_x1( cntl ) cntl->sub_packv_x1 -#define bli_cntl_sub_packv_y( cntl ) cntl->sub_packv_y -#define bli_cntl_sub_packv_y1( cntl ) cntl->sub_packv_y1 - -void bli_packv_cntl_init( void ); -void bli_packv_cntl_finalize( void ); -packv_t* bli_packv_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bmid, - pack_t pack_schema ); -void bli_packv_cntl_obj_init( packv_t* cntl, - impl_t impl_type, - varnum_t var_num, - bszid_t bmid, - pack_t pack_schema ); +void bli_trsm_cntl_init( void ); +void bli_trsm_cntl_finalize( void ); +trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, + varnum_t var_num, + bszid_t bszid, + scalm_t* sub_scalm, + packm_t* sub_pack_a, + packm_t* sub_pack_b, + packm_t* sub_pack_c, + trsm_t* sub_trsm, + gemm_t* sub_gemm, + unpackm_t* sub_unpack_c ); diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.h b/frame/base/bli_auxinfo.h similarity index 59% rename from frame/1m/unpackm/bli_unpackm_blk_var2.h rename to frame/base/bli_auxinfo.h index 1f783260a..aee1869a0 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var2.h +++ b/frame/base/bli_auxinfo.h @@ -32,30 +32,39 @@ */ -void bli_unpackm_blk_var2( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ); +#ifndef BLIS_AUXINFO_MACRO_DEFS_H +#define BLIS_AUXINFO_MACRO_DEFS_H -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_panel, \ - dim_t n_panel, \ - void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ); +// auxinfo_t field query -INSERT_GENTPROT_BASIC( unpackm_blk_var2 ) +#define bli_auxinfo_schema_a( auxinfo ) ( 
(auxinfo)->schema_a ) +#define bli_auxinfo_schema_b( auxinfo ) ( (auxinfo)->schema_b ) + +#define bli_auxinfo_next_a( auxinfo ) ( (auxinfo)->a_next ) +#define bli_auxinfo_next_b( auxinfo ) ( (auxinfo)->b_next ) + +#define bli_auxinfo_is_a( auxinfo ) ( (auxinfo)->is_a ) +#define bli_auxinfo_is_b( auxinfo ) ( (auxinfo)->is_b ) + + +// auxinfo_t field modification + +#define bli_auxinfo_set_schema_a( schema, auxinfo ) { (auxinfo).schema_a = schema; } +#define bli_auxinfo_set_schema_b( schema, auxinfo ) { (auxinfo).schema_b = schema; } + +#define bli_auxinfo_set_next_a( a_p, auxinfo ) { (auxinfo).a_next = a_p; } +#define bli_auxinfo_set_next_b( b_p, auxinfo ) { (auxinfo).b_next = b_p; } + +#define bli_auxinfo_set_next_ab( a_p, b_p, auxinfo ) \ +{ \ + bli_auxinfo_set_next_a( a_p, auxinfo ); \ + bli_auxinfo_set_next_b( b_p, auxinfo ); \ +} + +#define bli_auxinfo_set_is_a( is, auxinfo ) { (auxinfo).is_a = is; } +#define bli_auxinfo_set_is_b( is, auxinfo ) { (auxinfo).is_b = is; } + + +#endif diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c new file mode 100644 index 000000000..3b39befe4 --- /dev/null +++ b/frame/base/bli_cntl.c @@ -0,0 +1,186 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +cntl_t* bli_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + void* params, + cntl_t* sub_node + ) +{ + cntl_t* cntl; + mem_t* pack_mem; + + // Allocate the cntl_t struct. + cntl = bli_malloc_intl( sizeof( cntl_t ) ); + + bli_cntl_set_bszid( bszid, cntl ); + bli_cntl_set_var_func( var_func, cntl ); + bli_cntl_set_params( params, cntl ); + bli_cntl_set_sub_node( sub_node, cntl ); + + // Query the address of the node's packed mem_t entry so we can initialize + // key fields (to NULL or 0). + // NOTE: This initialization is important, since it allows threads to + // discern whether blocks have been acquired from the memory allocator. + pack_mem = bli_cntl_pack_mem( cntl ); + bli_mem_clear( pack_mem ); + + return cntl; +} + +void bli_cntl_obj_free + ( + cntl_t* cntl + ) +{ + bli_free_intl( cntl ); +} + +void bli_cntl_obj_clear + ( + cntl_t* cntl + ) +{ + mem_t* pack_mem; + + // Clear various fields in the control tree. 
Clearing these fields + // actually is not needed, but we do it for debugging/completeness. + bli_cntl_set_var_func( NULL, cntl ); + bli_cntl_set_params( NULL, cntl ); + bli_cntl_set_sub_node( NULL, cntl ); + + // Clearing these fields is potentially more important if the control + // tree is cached somewhere and reused. + pack_mem = bli_cntl_pack_mem( cntl ); + bli_mem_clear( pack_mem ); +} + +// ----------------------------------------------------------------------------- + +void bli_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // Base case: simply return when asked to free NULL nodes. + if ( cntl == NULL ) return; + + cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); + void* cntl_params = bli_cntl_params( cntl ); + mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); + + thrinfo_t* thread_sub_node = bli_thrinfo_sub_node( thread ); + + // Recursively free all memory associated with the sub-node and its + // children. + bli_cntl_free( cntl_sub_node, thread_sub_node ); + + // Free the current node's params field, if it is non-NULL. + if ( cntl_params != NULL ) + { + bli_free_intl( cntl_params ); + } + + // Release the current node's pack mem_t entry back to the memory + // broker from which it originated, but only if the current thread + // is chief for its group, and only if the mem_t is allocated. + if ( bli_thread_am_ochief( thread ) ) + if ( bli_mem_is_alloc( cntl_pack_mem ) ) + { + bli_membrk_release( cntl_pack_mem ); + } + + // Free the current node. + bli_cntl_obj_free( cntl ); +} + +// ----------------------------------------------------------------------------- + +cntl_t* bli_cntl_copy + ( + cntl_t* cntl + ) +{ + // Make a copy of the current node. Notice that the source node + // should NOT have any allocated/cached mem_t entries, and that + // bli_cntl_obj_create() creates a node with a cleared mem_t + // field. 
+ cntl_t* cntl_copy = bli_cntl_obj_create + ( + bli_cntl_bszid( cntl ), + bli_cntl_var_func( cntl ), + NULL, NULL + ); + + // Check the params field of the existing control tree; if it's non-NULL, + // copy it. + if ( bli_cntl_params( cntl ) != NULL ) + { + // Detect the size of the params struct by reading the first field + // as a uint64_t, and then allocate this many bytes for a new params + // struct. + uint64_t params_size = bli_cntl_params_size( cntl ); + void* params_orig = bli_cntl_params( cntl ); + void* params_copy = bli_malloc_intl( ( size_t )params_size ); + + // Copy the original params struct to the new memory region. + memcpy( params_copy, params_orig, params_size ); + + // Save the address of the new params struct into the new control + // tree node. + bli_cntl_set_params( params_copy, cntl_copy ); + } + + // If the sub-node exists, copy it recursively. + if ( bli_cntl_sub_node( cntl ) != NULL ) + { + cntl_t* sub_node_copy = bli_cntl_copy + ( + bli_cntl_sub_node( cntl ) + ); + + // Save the address of the new sub-node (sub-tree) into the new + // node. + bli_cntl_set_sub_node( sub_node_copy, cntl_copy ); + } + + // Return the address of the newly created node. + return cntl_copy; +} + diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h new file mode 100644 index 000000000..7b6000bb9 --- /dev/null +++ b/frame/base/bli_cntl.h @@ -0,0 +1,153 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +/* +// -- Control tree node definition -- + +struct cntl_s +{ + // Basic fields (usually required). + bszid_t bszid; + void* var_func; + struct cntl_s* sub_node; + + // Optional fields (needed only by some operations such as packm). + // NOTE: first field of params must be a uint64_t containing the size + // of the struct. + void* params; + + // Internal fields that track "cached" data. 
+ mem_t pack_mem; +}; +typedef struct cntl_s cntl_t; +*/ + + +// -- Control tree prototypes -- + +cntl_t* bli_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + void* params, + cntl_t* sub_node + ); + +void bli_cntl_obj_free + ( + cntl_t* cntl + ); + +void bli_cntl_obj_clear + ( + cntl_t* cntl + ); + +void bli_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +cntl_t* bli_cntl_copy + ( + cntl_t* cntl + ); + +// ----------------------------------------------------------------------------- + +// cntl_t query (fields only) + +#define bli_cntl_bszid( cntl ) \ +\ + ( (cntl)->bszid ) + +#define bli_cntl_var_func( cntl ) \ +\ + ( (cntl)->var_func ) + +#define bli_cntl_sub_node( cntl ) \ +\ + ( (cntl)->sub_node ) + +#define bli_cntl_params( cntl ) \ +\ + ( (cntl)->params ) + +#define bli_cntl_params_size( cntl ) \ +\ + ( *( ( uint64_t* )((cntl)->params) ) ) + +#define bli_cntl_pack_mem( cntl ) \ +\ + ( &((cntl)->pack_mem) ) + +// cntl_t query (complex) + +#define bli_cntl_is_leaf( cntl ) \ +\ + ( bli_cntl_sub_node( cntl ) == NULL ) + +#define bli_cntl_does_part( cntl ) \ +\ + ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ) + +// cntl_t modification + +#define bli_cntl_set_bszid( bszid0, cntl ) \ +{ \ + (cntl)->bszid = bszid0; \ +} + +#define bli_cntl_set_var_func( var_func0, cntl ) \ +{ \ + (cntl)->var_func = var_func0; \ +} + +#define bli_cntl_set_sub_node( sub_node0, cntl ) \ +{ \ + (cntl)->sub_node = sub_node0; \ +} + +#define bli_cntl_set_params( params0, cntl ) \ +{ \ + (cntl)->params = params0; \ +} + +#define bli_cntl_set_pack_mem( pack_mem0, cntl ) \ +{ \ + (cntl)->pack_mem = *(pack_mem0); \ +} + diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index d06167a07..f2885cca3 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -713,6 +713,36 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + return !bli_cntx_l3_ukr_dislikes_storage_of( 
obj, ukr_id, cntx ); +} + +bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + num_t dt = bli_obj_datatype( *obj ); + + // Reference the ukr storage preferences of the corresponding real + // micro-kernel for induced methods. + if ( bli_cntx_get_ind_method( cntx ) != BLIS_NAT ) + dt = bli_obj_datatype_proj_to_real( *obj ); + + const bool_t ukr_prefers_rows + = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); + const bool_t ukr_prefers_cols + = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); + bool_t r_val = FALSE; + + if ( bli_obj_is_row_stored( *obj ) && ukr_prefers_cols ) r_val = TRUE; + else if ( bli_obj_is_col_stored( *obj ) && ukr_prefers_rows ) r_val = TRUE; + + return r_val; +} + // ----------------------------------------------------------------------------- void bli_cntx_print( cntx_t* cntx ) @@ -803,6 +833,12 @@ void bli_cntx_print( cntx_t* cntx ) ); } + { + opid_t family = bli_cntx_get_family( cntx ); + + printf( "oper family : %lu\n", ( guint_t )family ); + } + { ind_t method = bli_cntx_get_ind_method( cntx ); @@ -810,18 +846,3 @@ void bli_cntx_print( cntx_t* cntx ) } } - - - - - - - - - - - - - - - diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 337d233b3..21f9c0fe0 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -53,6 +53,7 @@ typedef struct cntx_s func_t packm_ukrs; + opid_t family; ind_t method; pack_t schema_a; pack_t schema_b; @@ -102,6 +103,10 @@ typedef struct cntx_s \ (&((cntx)->packm_ukrs) ) +#define bli_cntx_family( cntx ) \ +\ + ( (cntx)->family ) + #define bli_cntx_method( cntx ) \ \ ( (cntx)->method ) @@ -164,6 +169,11 @@ typedef struct cntx_s (cntx_p)->packm_ukrs = _packm_ukrs; \ } +#define bli_cntx_set_family( _family, cntx_p ) \ +{ \ + (cntx_p)->family = _family; \ +} + #define bli_cntx_set_method( _method, cntx_p ) \ { \ (cntx_p)->method = _method; \ @@ -263,6 +273,10 @@ typedef struct cntx_s (dt), (&(bli_cntx_l3_nat_ukrs_prefs_buf( (cntx) 
))[ ukr_id ]) \ ) +#define bli_cntx_get_family( cntx ) \ +\ + bli_cntx_family( cntx ) + #define bli_cntx_get_ind_method( cntx ) \ \ bli_cntx_method( cntx ) @@ -391,6 +405,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); // print function diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index fede4f823..4c63b604d 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -114,9 +114,9 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) // -- Memory pool-related ------------------------------------------------------ -gint_t bli_info_get_mk_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_A_BLOCK ); } -gint_t bli_info_get_kn_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_B_PANEL ); } -gint_t bli_info_get_mn_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_C_PANEL ); } +gint_t bli_info_get_mk_pool_size( void ) { return bli_membrk_pool_size( bli_memsys_global_membrk(), BLIS_BUFFER_FOR_A_BLOCK ); } +gint_t bli_info_get_kn_pool_size( void ) { return bli_membrk_pool_size( bli_memsys_global_membrk(), BLIS_BUFFER_FOR_B_PANEL ); } +gint_t bli_info_get_mn_pool_size( void ) { return bli_membrk_pool_size( bli_memsys_global_membrk(), BLIS_BUFFER_FOR_C_PANEL ); } diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 6e793fa40..db598cede 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -83,17 +83,14 @@ err_t bli_init( void ) { // Initialize various sub-APIs. bli_const_init(); - bli_cntl_init(); bli_error_init(); - bli_mem_init(); + bli_memsys_init(); bli_ind_init(); bli_thread_init(); // After initialization is complete, mark BLIS as initialized. 
bli_is_init = TRUE; - //bli_mem_init(); - // Only the thread that actually performs the initialization will // return "success". r_val = BLIS_SUCCESS; @@ -150,9 +147,8 @@ err_t bli_finalize( void ) { // Finalize various sub-APIs. bli_const_finalize(); - bli_cntl_finalize(); bli_error_finalize(); - bli_mem_finalize(); + bli_memsys_finalize(); bli_ind_finalize(); bli_thread_finalize(); diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index 9ef741934..82bd88afb 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -36,18 +36,91 @@ #ifndef BLIS_MEM_H #define BLIS_MEM_H -// ----------------------------------------------------------------------------- -membrk_t* bli_mem_global_membrk( void ); -siz_t bli_mem_pool_size( packbuf_t buf_type ); +// Mem entry query -// ----------------------------------------------------------------------------- +#define bli_mem_pblk( mem_p ) \ +\ + ( &((mem_p)->pblk) ) -void bli_mem_init( void ); -void bli_mem_reinit( cntx_t* cntx ); -void bli_mem_finalize( void ); -bool_t bli_mem_is_initialized( void ); +#define bli_mem_buffer( mem_p ) \ +\ + ( bli_pblk_buf_align( bli_mem_pblk( mem_p ) ) ) + +#define bli_mem_buf_sys( mem_p ) \ +\ + ( bli_pblk_buf_sys( bli_mem_pblk( mem_p ) ) ) + +#define bli_mem_buf_type( mem_p ) \ +\ + ( (mem_p)->buf_type ) + +#define bli_mem_pool( mem_p ) \ +\ + ( (mem_p)->pool ) + +#define bli_mem_membrk( mem_p ) \ +\ + ( (mem_p)->membrk ) + +#define bli_mem_size( mem_p ) \ +\ + ( (mem_p)->size ) + +#define bli_mem_is_alloc( mem_p ) \ +\ + ( bli_mem_buffer( mem_p ) != NULL ) + +#define bli_mem_is_unalloc( mem_p ) \ +\ + ( bli_mem_buffer( mem_p ) == NULL ) -#endif +// Mem entry modification +#define bli_mem_set_pblk( pblk_p, mem_p ) \ +{ \ + (mem_p)->pblk = *(pblk_p); \ +} + +#define bli_mem_set_buffer( buf0, mem_p ) \ +{ \ + bli_pblk_set_buf_align( buf0, &((mem_p)->pblk) ); \ +} + +#define bli_mem_set_buf_sys( buf0, mem_p ) \ +{ \ + bli_pblk_set_buf_sys( buf0, &((mem_p)->pblk) ); \ +} + +#define 
bli_mem_set_buf_type( buf_type0, mem_p ) \ +{ \ + (mem_p)->buf_type = buf_type0; \ +} + +#define bli_mem_set_pool( pool0, mem_p ) \ +{ \ + (mem_p)->pool = pool0; \ +} + +#define bli_mem_set_membrk( membrk0, mem_p ) \ +{ \ + (mem_p)->membrk = membrk0; \ +} + +#define bli_mem_set_size( size0, mem_p ) \ +{ \ + (mem_p)->size = size0; \ +} + +#define bli_mem_clear( mem_p ) \ +{ \ + bli_mem_set_buffer( NULL, mem_p ); \ + bli_mem_set_buf_sys( NULL, mem_p ); \ + bli_mem_set_pool( NULL, mem_p ); \ + bli_mem_set_size( 0, mem_p ); \ + bli_mem_set_membrk( NULL, mem_p ); \ +} + + +#endif diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c new file mode 100644 index 000000000..e66aafa63 --- /dev/null +++ b/frame/base/bli_memsys.c @@ -0,0 +1,174 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS +pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif + +static membrk_t global_membrk; + +// ----------------------------------------------------------------------------- + +membrk_t* bli_memsys_global_membrk( void ) +{ + return &global_membrk; +} + +// ----------------------------------------------------------------------------- + +static bool_t bli_memsys_is_init = FALSE; + +void bli_memsys_init( void ) +{ + cntx_t cntx; + + // If the initialization flag is TRUE, we know the API is already + // initialized, so we can return early. + if ( bli_memsys_is_init == TRUE ) return; + + // Create and initialize a context for gemm so we have something + // to pass into bli_membrk_init_pools(). + bli_gemm_cntx_init( &cntx ); + +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + // Here, we test the initialization flag again. NOTE: THIS IS NOT + // REDUNDANT. This additional test is needed so that other threads + // that may be waiting to acquire the lock do not perform any + // initialization actions once they are finally allowed into this + // critical section. + if ( bli_memsys_is_init == FALSE ) + { + // Initialize the global membrk_t object and its memory pools. 
+ bli_membrk_init( &cntx, &global_membrk ); + + // After initialization, mark the API as initialized. + bli_memsys_is_init = TRUE; + } + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif + + // Finalize the temporary gemm context. + bli_gemm_cntx_finalize( &cntx ); +} + +void bli_memsys_reinit( cntx_t* cntx ) +{ +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + // If for some reason the memory pools have not yet been + // initialized (unlikely), we emulate the body of bli_memsys_init(). + if ( bli_memsys_is_init == FALSE ) + { + // Initialize the global membrk_t object and its memory pools. + bli_membrk_init( cntx, &global_membrk ); + + // After initialization, mark the API as initialized. + bli_memsys_is_init = TRUE; + } + else + { + // Reinitialize the global membrk_t object's memory pools. + bli_membrk_reinit_pools( cntx, &global_membrk ); + } + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif +} + +void bli_memsys_finalize( void ) +{ + // If the initialization flag is FALSE, we know the API is already + // uninitialized, so we can return early. + if ( bli_memsys_is_init == FALSE ) return; + +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + // Here, we test the initialization flag again. NOTE: THIS IS NOT + // REDUNDANT. This additional test is needed so that other threads + // that may be waiting to acquire the lock do not perform any + // finalization actions once they are finally allowed into this + // critical section. + if ( bli_memsys_is_init == TRUE ) + { + // Finalize the global membrk_t object and its memory pools. 
+ bli_membrk_finalize( &global_membrk ); + + // After finalization, mark the API as uninitialized. + bli_memsys_is_init = FALSE; + } + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif +} + +bool_t bli_memsys_is_initialized( void ) +{ + return bli_memsys_is_init; +} + diff --git a/frame/base/bli_memsys.h b/frame/base/bli_memsys.h new file mode 100644 index 000000000..0a7b142a7 --- /dev/null +++ b/frame/base/bli_memsys.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_MEMSYS_H +#define BLIS_MEMSYS_H + +// ----------------------------------------------------------------------------- + +membrk_t* bli_memsys_global_membrk( void ); + +// ----------------------------------------------------------------------------- + +void bli_memsys_init( void ); +void bli_memsys_reinit( cntx_t* cntx ); +void bli_memsys_finalize( void ); +bool_t bli_memsys_is_initialized( void ); + + +#endif + diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 226b0747a..e1f05d075 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -65,7 +65,6 @@ void bli_obj_create_without_buffer( num_t dt, obj_t* obj ) { siz_t elem_size; - mem_t* pack_mem; void* s; if ( bli_error_checking_is_enabled() ) @@ -98,9 +97,6 @@ void bli_obj_create_without_buffer( num_t dt, bli_obj_set_offs( 0, 0, *obj ); bli_obj_set_diag_offset( 0, *obj ); - pack_mem = bli_obj_pack_mem( *obj ); - bli_mem_set_buffer( NULL, pack_mem ); - // Set the internal scalar to 1.0. 
s = bli_obj_internal_scalar_buffer( *obj ); @@ -467,8 +463,6 @@ num_t bli_datatype_union( num_t dt1, num_t dt2 ) void bli_obj_print( char* label, obj_t* obj ) { FILE* file = stdout; - mem_t* pack_mem = bli_obj_pack_mem( *obj ); - //mem_t* cast_mem = bli_obj_cast_mem( *obj ); if ( bli_error_checking_is_enabled() ) bli_obj_print_check( label, obj ); @@ -491,10 +485,6 @@ void bli_obj_print( char* label, obj_t* obj ) fprintf( file, " rs, cs %ld, %ld\n", ( signed long int )bli_obj_row_stride( *obj ), ( signed long int )bli_obj_col_stride( *obj ) ); fprintf( file, " is %ld\n", ( signed long int )bli_obj_imag_stride( *obj ) ); - fprintf( file, " pack_mem \n" ); - fprintf( file, " - buf %p\n", ( void* )bli_mem_buffer( pack_mem ) ); - fprintf( file, " - buf_type %lu\n", ( unsigned long int )bli_mem_buf_type( pack_mem ) ); - fprintf( file, " - size %lu\n", ( unsigned long int )bli_mem_size( pack_mem ) ); fprintf( file, " m_padded %lu\n", ( unsigned long int )bli_obj_padded_length( *obj ) ); fprintf( file, " n_padded %lu\n", ( unsigned long int )bli_obj_padded_width( *obj ) ); fprintf( file, " ps %lu\n", ( unsigned long int )bli_obj_panel_stride( *obj ) ); diff --git a/frame/base/old/bli_mem.c.prev b/frame/base/old/bli_mem.c.prev new file mode 100644 index 000000000..7a16e8732 --- /dev/null +++ b/frame/base/old/bli_mem.c.prev @@ -0,0 +1,366 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS +extern pthread_mutex_t mem_manager_mutex; +#endif + +// Declare one memory pool structure for each block size/shape we want to +// be able to allocate. + +static pool_t pools[3]; + + +// Physically contiguous memory for each pool. +// +// Generally speaking, the pool sizes are computed in a sub-header of blis.h +// as follows: +// +// BLIS_MK_POOL_SIZE = BLIS_MAXIMUM_MC_? * BLIS_MAXIMUM_KC_? * BLIS_SIZEOF_? +// +// where "?" is the datatype that results in the largest pool size. The +// constants BLIS_KN_POOL_SIZE and BLIS_MN_POOL_SIZE are computed in a +// similar manner. All constants are computed with appropriate "padding" +// to ensure enough space given the alignments required by bli_config.h. 
+// + +static void* pool_mk_blk_ptrs[ BLIS_NUM_MC_X_KC_BLOCKS ]; +static void* pool_kn_blk_ptrs[ BLIS_NUM_KC_X_NC_BLOCKS ]; +static void* pool_mn_blk_ptrs[ BLIS_NUM_MC_X_NC_BLOCKS ]; + +#define BLIS_USE_HEAP + +#ifdef BLIS_USE_HEAP +static char* pool_mk_mem = NULL; +static char* pool_kn_mem = NULL; +static char* pool_mn_mem = NULL; +#else +static char pool_mk_mem[ BLIS_MK_POOL_SIZE ]; +static char pool_kn_mem[ BLIS_KN_POOL_SIZE ]; +static char pool_mn_mem[ BLIS_MN_POOL_SIZE ]; +#endif + + + +void bli_mem_acquire_m( siz_t req_size, + packbuf_t buf_type, + mem_t* mem ) +{ + siz_t block_size; + dim_t pool_index; + pool_t* pool; + void** block_ptrs; + void* block; + gint_t i; + + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // For general-use buffer requests, such as those used by level-2 + // operations, using bli_malloc() is sufficient, since using + // physically contiguous memory is not as important there. + block = bli_malloc( req_size ); + + // Initialize the mem_t object with: + // - the address of the memory block, + // - the buffer type (a packbuf_t value), and + // - the size of the requested region. + // NOTE: We do not initialize the pool field since this block did not + // come from a contiguous memory pool. + bli_mem_set_buffer( block, mem ); + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_size( req_size, mem ); + } + else + { + // This branch handles cases where the memory block needs to come + // from one of the contiguous memory pools. + + // Map the requested packed buffer type to a zero-based index, which + // we then use to select the corresponding memory pool. + pool_index = bli_packbuf_index( buf_type ); + pool = &pools[ pool_index ]; + + // Unconditionally perform error checking on the memory pool. + { + err_t e_val; + + // Make sure that the requested matrix size fits inside of a block + // of the corresponding pool. 
+ e_val = bli_check_requested_block_size_for_pool( req_size, pool ); + bli_check_error_code( e_val ); + + // Make sure that the pool contains at least one block to check out + // to the thread. + e_val = bli_check_if_exhausted_pool( pool ); + bli_check_error_code( e_val ); + } + + // Access the block pointer array from the memory pool data structure. + block_ptrs = bli_pool_block_ptrs( pool ); + + + // BEGIN CRITICAL SECTION +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + { + + // Query the index of the contiguous memory block that resides at the + // "top" of the pool. + i = bli_pool_top_index( pool ); + + // Extract the address of the top block from the block pointer array. + block = block_ptrs[i]; + + // Clear the entry from the block pointer array. (This is actually not + // necessary.) + //block_ptrs[i] = NULL; + + // Decrement the top of the memory pool. + bli_pool_dec_top_index( pool ); + + + // END CRITICAL SECTION + } +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif + + // Query the size of the blocks in the pool so we can store it in the + // mem_t object. + block_size = bli_pool_block_size( pool ); + + // Initialize the mem_t object with: + // - the address of the memory block, + // - the buffer type (a packbuf_t value), + // - the address of the memory pool to which it belongs, and + // - the size of the contiguous memory block (NOT the size of the + // requested region). + bli_mem_set_buffer( block, mem ); + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_pool( pool, mem ); + bli_mem_set_size( block_size, mem ); + } +} + + +void bli_mem_release( mem_t* mem ) +{ + packbuf_t buf_type; + pool_t* pool; + void** block_ptrs; + void* block; + gint_t i; + + // Extract the address of the memory block we are trying to + // release. 
+ block = bli_mem_buffer( mem ); + + // Extract the buffer type so we know what kind of memory was allocated. + buf_type = bli_mem_buf_type( mem ); + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // For general-use buffers, we allocate with bli_malloc(), and so + // here we need to call bli_free(). + bli_free( block ); + } + else + { + // This branch handles cases where the memory block came from one + // of the contiguous memory pools. + + // Extract the pool from which the block was allocated. + pool = bli_mem_pool( mem ); + + // Extract the block pointer array associated with the pool. + block_ptrs = bli_pool_block_ptrs( pool ); + + + // BEGIN CRITICAL SECTION +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + { + + // Increment the top of the memory pool. + bli_pool_inc_top_index( pool ); + + // Query the newly incremented top index. + i = bli_pool_top_index( pool ); + + // Place the address of the block back onto the top of the memory pool. + block_ptrs[i] = block; + + + // END CRITICAL SECTION + } +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif + } + + + // Clear the mem_t object so that it appears unallocated. We clear: + // - the buffer field, + // - the pool field, and + // - the size field. + // NOTE: We do not clear the buf_type field since there is no + // "uninitialized" value for packbuf_t. 
+ bli_mem_set_buffer( NULL, mem ); + bli_mem_set_pool( NULL, mem ); + bli_mem_set_size( 0, mem ); +} + + +void bli_mem_acquire_v( siz_t req_size, + mem_t* mem ) +{ + bli_mem_acquire_m( req_size, + BLIS_BUFFER_FOR_GEN_USE, + mem ); +} + + + +void bli_mem_init() +{ + dim_t index_a; + dim_t index_b; + dim_t index_c; + +#ifdef BLIS_USE_HEAP + pool_mk_mem = bli_malloc( BLIS_MK_POOL_SIZE ); + pool_kn_mem = bli_malloc( BLIS_KN_POOL_SIZE ); + pool_mn_mem = bli_malloc( BLIS_MN_POOL_SIZE ); +#endif + + // Map each of the packbuf_t values to an index starting at zero. + index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); + index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); + index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); + + // Initialize contiguous memory pool for MC x KC blocks. + bli_mem_init_pool( pool_mk_mem, + BLIS_MK_BLOCK_SIZE, + BLIS_NUM_MC_X_KC_BLOCKS, + pool_mk_blk_ptrs, + &pools[ index_a ] ); + + // Initialize contiguous memory pool for KC x NC blocks. + bli_mem_init_pool( pool_kn_mem, + BLIS_KN_BLOCK_SIZE, + BLIS_NUM_KC_X_NC_BLOCKS, + pool_kn_blk_ptrs, + &pools[ index_b ] ); + + // Initialize contiguous memory pool for MC x NC blocks. + bli_mem_init_pool( pool_mn_mem, + BLIS_MN_BLOCK_SIZE, + BLIS_NUM_MC_X_NC_BLOCKS, + pool_mn_blk_ptrs, + &pools[ index_c ] ); +} + + +void bli_mem_init_pool( char* pool_mem, + siz_t block_size, + dim_t num_blocks, + void** block_ptrs, + pool_t* pool ) +{ + const siz_t align_size = BLIS_CONTIG_ADDR_ALIGN_SIZE; + dim_t i; + + // If the pool starting address is not already aligned, advance it + // accordingly. + if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) + { + // Notice that this works even if the alignment is not a power of two. 
+ pool_mem += ( ( uintptr_t )align_size - + ( ( uintptr_t )pool_mem % align_size ) ); + } + + // Step through the memory pool, beginning with the aligned address + // determined above, assigning pointers to the beginning of each block_size + // bytes to the ith element of the block_ptrs array. + for ( i = 0; i < num_blocks; ++i ) + { + // Save the address of pool, which is guaranteed to be aligned. + block_ptrs[i] = pool_mem; + + // Advance pool by one block. + pool_mem += block_size; + + // Advance pool a bit further if needed in order to get to the + // beginning of an alignment boundary. + if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) + { + pool_mem += ( ( uintptr_t )align_size - + ( ( uintptr_t )pool_mem % align_size ) ); + } + } + + // Now that we have initialized the array of pointers to the individual + // blocks in the pool, we initialize a pool_t data structure so that we + // can easily manage this pool. + bli_pool_init( num_blocks, + block_size, + block_ptrs, + pool ); +} + + + +void bli_mem_finalize() +{ + // Nothing to do. 
+ +#ifdef BLIS_USE_HEAP + bli_free( pool_mk_mem ); + bli_free( pool_kn_mem ); + bli_free( pool_mn_mem ); +#endif + +} + diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 9ac03de97..a50968845 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -46,6 +46,5 @@ extern obj_t BLIS_MINUS_TWO; extern thrcomm_t BLIS_SINGLE_COMM; extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; -extern thrinfo_t BLIS_HERK_SINGLE_THREADED; #endif diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index 01cf44e79..d99be2345 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -120,14 +120,12 @@ #include "bli_gentfunc_macro_defs.h" #include "bli_gentprot_macro_defs.h" -#include "bli_mem_macro_defs.h" #include "bli_obj_macro_defs.h" #include "bli_param_macro_defs.h" #include "bli_complex_macro_defs.h" #include "bli_scalar_macro_defs.h" #include "bli_error_macro_defs.h" #include "bli_blas_macro_defs.h" -#include "bli_auxinfo_macro_defs.h" #endif diff --git a/frame/include/bli_mem_macro_defs.h b/frame/include/bli_mem_macro_defs.h deleted file mode 100644 index d0fe850cd..000000000 --- a/frame/include/bli_mem_macro_defs.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2016 Hewlett Packard Enterprise Development LP - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifndef BLIS_MEM_MACRO_DEFS_H -#define BLIS_MEM_MACRO_DEFS_H - - -// Mem entry query - -#define bli_mem_pblk( mem_p ) \ -\ - ( &((mem_p)->pblk) ) - -#define bli_mem_buffer( mem_p ) \ -\ - ( bli_pblk_buf_align( bli_mem_pblk( mem_p ) ) ) - -#define bli_mem_buf_sys( mem_p ) \ -\ - ( bli_pblk_buf_sys( bli_mem_pblk( mem_p ) ) ) - -#define bli_mem_buf_type( mem_p ) \ -\ - ( (mem_p)->buf_type ) - -#define bli_mem_pool( mem_p ) \ -\ - ( (mem_p)->pool ) - -#define bli_mem_membrk( mem_p ) \ -\ - ( (mem_p)->membrk ) - -#define bli_mem_size( mem_p ) \ -\ - ( (mem_p)->size ) - -#define bli_mem_is_alloc( mem_p ) \ -\ - ( bli_mem_buffer( mem_p ) != NULL ) - -#define bli_mem_is_unalloc( mem_p ) \ -\ - ( bli_mem_buffer( mem_p ) == NULL ) - - -// Mem entry modification - -#define bli_mem_set_pblk( pblk_p, mem_p ) \ -{ \ - mem_p->pblk = *(pblk_p); \ -} - -#define bli_mem_set_buffer( buf0, mem_p ) \ -{ \ - bli_pblk_set_buf_align( buf0, &(mem_p->pblk) ); \ -} - -#define bli_mem_set_buf_sys( buf0, mem_p ) \ -{ \ - bli_pblk_set_buf_sys( buf0, &(mem_p->pblk) ); \ -} - -#define bli_mem_set_buf_type( buf_type0, mem_p ) \ -{ \ - (mem_p)->buf_type = buf_type0; \ -} - -#define bli_mem_set_pool( pool0, mem_p ) \ -{ \ - (mem_p)->pool = pool0; \ -} - -#define bli_mem_set_membrk( membrk0, mem_p ) \ -{ \ - (mem_p)->membrk = membrk0; \ -} - -#define bli_mem_set_size( size0, mem_p ) \ -{ \ - mem_p->size = size0; \ -} - -#define bli_mem_clear( mem_p ) \ -{ \ - bli_mem_set_buffer( NULL, mem_p ); \ - bli_mem_set_buf_sys( NULL, mem_p ); \ - bli_mem_set_pool( NULL, mem_p ); \ - bli_mem_set_size( 0, mem_p ); \ - bli_mem_set_membrk( NULL, mem_p ); \ -} - - -#endif diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 306c09544..0d5992900 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -812,21 +812,6 @@ bli_obj_width_stored( obj ) (obj).elem_size = size; \ } - -// Pack mem_t entry query - -#define bli_obj_pack_mem( obj 
) \ -\ - ( &((obj).pack_mem) ) - -// Pack mem_t entry modification - -#define bli_obj_set_pack_mem( mem_p, obj ) \ -{ \ - (obj).pack_mem = *mem_p; \ -} - - // Packed matrix info query #define bli_obj_padded_length( obj ) \ @@ -839,6 +824,12 @@ bli_obj_width_stored( obj ) // Packed matrix info modification +#define bli_obj_set_buffer_to_mem( mem_p, obj ) \ +{ \ + void* buf = bli_mem_buffer( mem_p ); \ + bli_obj_set_buffer( buf, obj ); \ +} \ + #define bli_obj_set_padded_length( m0, obj ) \ { \ (obj).m_padded = m0; \ @@ -900,15 +891,7 @@ bli_obj_width_stored( obj ) // -- Miscellaneous object macros -- -// Make a special alias (shallow copy) that does not overwrite pack_mem -// entry. - -#define bli_obj_alias_for_packing( a, b ) \ -{ \ - bli_obj_init_basic_shallow_copy_of( a, b ); \ -} - -// Make a full alias (shallow copy), including pack_mem and friends +// Make a full alias (shallow copy) #define bli_obj_alias_to( a, b ) \ { \ @@ -948,28 +931,6 @@ bli_obj_width_stored( obj ) } -// Initialize object for packing purposes - -#define bli_obj_init_pack( obj_p ) \ -{ \ - mem_t* pack_mem_ = bli_obj_pack_mem( *obj_p ); \ -\ - bli_mem_set_buffer( NULL, pack_mem_ ); \ -} - - -// Release object's pack mem_t entries back to memory manager - -#define bli_obj_release_pack( obj_p ) \ -{ \ - mem_t* pack_mem_ = bli_obj_pack_mem( *(obj_p) ); \ -\ - if ( bli_mem_is_alloc( pack_mem_ ) ) \ - bli_membrk_release( pack_mem_ ); \ -} - - - // Submatrix/scalar buffer acquisition #define BLIS_CONSTANT_SLOT_SIZE BLIS_MAX_TYPE_SIZE diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index eb2312fee..086740cfd 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -510,234 +510,6 @@ typedef enum } dir_t; -// -// -- BLIS misc. 
structure types ----------------------------------------------- -// - -// -- Mutex type -- - -typedef struct mtx_s mtx_t; - -// -- Pool block type -- - -typedef struct -{ - void* buf_sys; - void* buf_align; -} pblk_t; - -// -- Pool type -- - -typedef struct -{ - pblk_t* block_ptrs; - dim_t block_ptrs_len; - - dim_t top_index; - dim_t num_blocks; - - siz_t block_size; - siz_t align_size; -} pool_t; - -// -- Memory broker object type -- - -typedef struct membrk_s membrk_t; -/* -{ - pool_t pools[3]; - mtx_t mutex; - - malloc_ft malloc_fp; - free_ft free_fp; -} membrk_t; -*/ - -// -- Memory object type -- - -typedef struct mem_s -{ - pblk_t pblk; - packbuf_t buf_type; - pool_t* pool; - membrk_t* membrk; - siz_t size; -} mem_t; - -// -- Blocksize object type -- - -typedef struct blksz_s -{ - // Primary blocksize values. - dim_t v[BLIS_NUM_FP_TYPES]; - - // Blocksize extensions. - dim_t e[BLIS_NUM_FP_TYPES]; - -} blksz_t; - -// -- Function pointer object type -- - -typedef struct func_s -{ - // Kernel function address. - void* ptr[BLIS_NUM_FP_TYPES]; - -} func_t; - -// -- Multi-boolean object type -- - -typedef struct mbool_s -{ - bool_t v[BLIS_NUM_FP_TYPES]; - -} mbool_t; - -// -- Auxiliary kernel info type -- - -// Note: This struct is used by macro-kernels to package together extra -// parameter values that may be of use to the micro-kernel without -// cluttering up the micro-kernel interface itself. - -typedef struct -{ - // The pack schemas of A and B. - pack_t schema_a; - pack_t schema_b; - - // Pointers to the micro-panels of A and B which will be used by the - // next call to the micro-kernel. - void* a_next; - void* b_next; - - // The imaginary strides of A and B. 
- inc_t is_a; - inc_t is_b; - -} auxinfo_t; - - - -// -// -- BLIS object type definitions --------------------------------------------- -// - -typedef struct obj_s -{ - // Basic fields - struct obj_s* root; - - dim_t off[2]; - dim_t dim[2]; - doff_t diag_off; - - objbits_t info; - siz_t elem_size; - - void* buffer; - inc_t rs; - inc_t cs; - inc_t is; - - // Bufferless scalar storage - atom_t scalar; - - // Pack-related fields - mem_t pack_mem; // cached memory region for packing - dim_t m_padded; // m dimension of matrix, including any padding - dim_t n_padded; // n dimension of matrix, including any padding - inc_t ps; // panel stride (distance to next panel) - inc_t pd; // panel dimension (the "width" of a panel: - // usually MR or NR) - dim_t m_panel; // m dimension of a "full" panel - dim_t n_panel; // n dimension of a "full" panel -} obj_t; - - -// Define these macros here since they must be updated if contents of -// obj_t changes. -#define bli_obj_init_basic_shallow_copy_of( a, b ) \ -{ \ - (b).root = (a).root; \ -\ - (b).off[0] = (a).off[0]; \ - (b).off[1] = (a).off[1]; \ - (b).dim[0] = (a).dim[0]; \ - (b).dim[1] = (a).dim[1]; \ - (b).diag_off = (a).diag_off; \ -\ - (b).info = (a).info; \ - (b).elem_size = (a).elem_size; \ -\ - (b).buffer = (a).buffer; \ - (b).rs = (a).rs; \ - (b).cs = (a).cs; \ - (b).is = (a).is; \ -\ - (b).scalar = (a).scalar; \ -\ - /* We must NOT copy pack_mem field since this macro forms the basis of - bli_obj_alias_to(), which is used in packm_init(). There, we want to - copy the basic fields of the obj_t but PRESERVE the pack_mem field - of the destination object since it holds the "cached" mem_t object - and buffer. The other fields, such as padded dimensions, are always - set by bli_packm_init(), so we don't need to copy them either. */ \ -} - -#define bli_obj_init_full_shallow_copy_of( a, b ) \ -{ \ - /* This macro implements a full alias (shallow copy) that copies all - fields of the obj_t struct. 
*/ \ - bli_obj_init_basic_shallow_copy_of( a, b ); \ -\ - (b).pack_mem = (a).pack_mem; \ - (b).m_padded = (a).m_padded; \ - (b).n_padded = (a).n_padded; \ - (b).ps = (a).ps; \ - (b).pd = (a).pd; \ - (b).m_panel = (a).m_panel; \ - (b).n_panel = (a).n_panel; \ -} - -#define bli_obj_init_subpart_from( a, b ) \ -{ \ - (b).root = (a).root; \ -\ - (b).off[0] = (a).off[0]; \ - (b).off[1] = (a).off[1]; \ - /* Avoid copying m since it will be overwritten. */ \ - /* Avoid copying n since it will be overwritten. */ \ - (b).diag_off = (a).diag_off; \ -\ - (b).info = (a).info; \ - (b).elem_size = (a).elem_size; \ -\ - (b).buffer = (a).buffer; \ - (b).rs = (a).rs; \ - (b).cs = (a).cs; \ - (b).is = (a).is; \ -\ - (b).scalar = (a).scalar; \ -\ - /* We want to copy the pack_mem field here because this macro is used - when creating subpartitions, including those of packed objects. In - those situations, we want the subpartition to inherit the pack_mem - field of its parent, as well as other related fields such as the - padded dimensions. 
*/ \ - (b).pack_mem = (a).pack_mem; \ - (b).m_padded = (a).m_padded; \ - (b).n_padded = (a).n_padded; \ - (b).pd = (a).pd; \ - (b).ps = (a).ps; \ - (b).m_panel = (a).m_panel; \ - (b).n_panel = (a).n_panel; \ -} - - -// -// -- Other BLIS enumerated type definitions ----------------------------------- -// - // -- Subpartition type -- typedef enum @@ -791,6 +563,7 @@ typedef enum #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 + // -- Induced method types -- typedef enum @@ -807,6 +580,7 @@ typedef enum #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) + // -- Kernel ID types -- typedef enum @@ -828,6 +602,7 @@ typedef enum #define BLIS_NUM_LEVEL1V_KERS 13 + typedef enum { BLIS_AXPY2V_KER = 0, @@ -839,6 +614,7 @@ typedef enum #define BLIS_NUM_LEVEL1F_KERS 5 + typedef enum { BLIS_GEMM_UKR = 0, @@ -850,6 +626,7 @@ typedef enum #define BLIS_NUM_LEVEL3_UKRS 5 + typedef enum { BLIS_REFERENCE_UKERNEL = 0, @@ -911,11 +688,245 @@ typedef enum BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_VF, // level-1v vector fusing factor + + BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 13 +// +// -- BLIS misc. 
structure types ----------------------------------------------- +// + +// -- Mutex type -- + +typedef struct mtx_s mtx_t; + +// -- Pool block type -- + +typedef struct +{ + void* buf_sys; + void* buf_align; +} pblk_t; + +// -- Pool type -- + +typedef struct +{ + pblk_t* block_ptrs; + dim_t block_ptrs_len; + + dim_t top_index; + dim_t num_blocks; + + siz_t block_size; + siz_t align_size; +} pool_t; + +// -- Memory broker object type -- + +typedef struct membrk_s membrk_t; +/* +{ + pool_t pools[3]; + mtx_t mutex; + + malloc_ft malloc_fp; + free_ft free_fp; +} membrk_t; +*/ + +// -- Memory object type -- + +typedef struct mem_s +{ + pblk_t pblk; + packbuf_t buf_type; + pool_t* pool; + membrk_t* membrk; + siz_t size; +} mem_t; + +// -- Control tree node type -- + +struct cntl_s +{ + // Basic fields (usually required). + bszid_t bszid; + void* var_func; + struct cntl_s* sub_node; + + // Optional fields (needed only by some operations such as packm). + // NOTE: first field of params must be a uint64_t containing the size + // of the struct. + void* params; + + // Internal fields that track "cached" data. + mem_t pack_mem; +}; +typedef struct cntl_s cntl_t; + + +// -- Blocksize object type -- + +typedef struct blksz_s +{ + // Primary blocksize values. + dim_t v[BLIS_NUM_FP_TYPES]; + + // Blocksize extensions. + dim_t e[BLIS_NUM_FP_TYPES]; + +} blksz_t; + + +// -- Function pointer object type -- + +typedef struct func_s +{ + // Kernel function address. + void* ptr[BLIS_NUM_FP_TYPES]; + +} func_t; + + +// -- Multi-boolean object type -- + +typedef struct mbool_s +{ + bool_t v[BLIS_NUM_FP_TYPES]; + +} mbool_t; + + +// -- Auxiliary kernel info type -- + +// Note: This struct is used by macro-kernels to package together extra +// parameter values that may be of use to the micro-kernel without +// cluttering up the micro-kernel interface itself. + +typedef struct +{ + // The pack schemas of A and B. 
+ pack_t schema_a; + pack_t schema_b; + + // Pointers to the micro-panels of A and B which will be used by the + // next call to the micro-kernel. + void* a_next; + void* b_next; + + // The imaginary strides of A and B. + inc_t is_a; + inc_t is_b; + +} auxinfo_t; + + +// +// -- BLIS object type definitions --------------------------------------------- +// + +typedef struct obj_s +{ + // Basic fields + struct obj_s* root; + + dim_t off[2]; + dim_t dim[2]; + doff_t diag_off; + + objbits_t info; + siz_t elem_size; + + void* buffer; + inc_t rs; + inc_t cs; + inc_t is; + + // Bufferless scalar storage + atom_t scalar; + + // Pack-related fields + dim_t m_padded; // m dimension of matrix, including any padding + dim_t n_padded; // n dimension of matrix, including any padding + inc_t ps; // panel stride (distance to next panel) + inc_t pd; // panel dimension (the "width" of a panel: + // usually MR or NR) + dim_t m_panel; // m dimension of a "full" panel + dim_t n_panel; // n dimension of a "full" panel +} obj_t; + + +// Define these macros here since they must be updated if contents of +// obj_t changes. + +#define bli_obj_init_full_shallow_copy_of( a, b ) \ +{ \ + (b).root = (a).root; \ +\ + (b).off[0] = (a).off[0]; \ + (b).off[1] = (a).off[1]; \ + (b).dim[0] = (a).dim[0]; \ + (b).dim[1] = (a).dim[1]; \ + (b).diag_off = (a).diag_off; \ +\ + (b).info = (a).info; \ + (b).elem_size = (a).elem_size; \ +\ + (b).buffer = (a).buffer; \ + (b).rs = (a).rs; \ + (b).cs = (a).cs; \ + (b).is = (a).is; \ +\ + (b).scalar = (a).scalar; \ +\ + /*(b).pack_mem = (a).pack_mem;*/ \ + (b).m_padded = (a).m_padded; \ + (b).n_padded = (a).n_padded; \ + (b).ps = (a).ps; \ + (b).pd = (a).pd; \ + (b).m_panel = (a).m_panel; \ + (b).n_panel = (a).n_panel; \ +} + +#define bli_obj_init_subpart_from( a, b ) \ +{ \ + (b).root = (a).root; \ +\ + (b).off[0] = (a).off[0]; \ + (b).off[1] = (a).off[1]; \ + /* Avoid copying m since it will be overwritten. 
*/ \ + /* Avoid copying n since it will be overwritten. */ \ + (b).diag_off = (a).diag_off; \ +\ + (b).info = (a).info; \ + (b).elem_size = (a).elem_size; \ +\ + (b).buffer = (a).buffer; \ + (b).rs = (a).rs; \ + (b).cs = (a).cs; \ + (b).is = (a).is; \ +\ + (b).scalar = (a).scalar; \ +\ + /* We want to copy the pack_mem field here because this macro is used + when creating subpartitions, including those of packed objects. In + those situations, we want the subpartition to inherit the pack_mem + field of its parent, as well as other related fields such as the + padded dimensions. */ \ + /*(b).pack_mem = (a).pack_mem;*/ \ + (b).m_padded = (a).m_padded; \ + (b).n_padded = (a).n_padded; \ + (b).pd = (a).pd; \ + (b).ps = (a).ps; \ + (b).m_panel = (a).m_panel; \ + (b).n_panel = (a).n_panel; \ +} + + // -- Context type -- typedef struct cntx_s @@ -932,6 +943,7 @@ typedef struct cntx_s func_t packm_ukrs; + opid_t family; ind_t method; pack_t schema_a; pack_t schema_b; diff --git a/frame/include/blis.h b/frame/include/blis.h index 32fca0c71..0eaaf413f 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -106,6 +106,7 @@ extern "C" { #include "bli_ind.h" #include "bli_membrk.h" #include "bli_pool.h" +#include "bli_memsys.h" #include "bli_mem.h" #include "bli_part.h" #include "bli_prune.h" @@ -113,6 +114,7 @@ extern "C" { #include "bli_blksz.h" #include "bli_func.h" #include "bli_mbool.h" +#include "bli_auxinfo.h" #include "bli_param_map.h" #include "bli_clock.h" #include "bli_check.h" diff --git a/frame/ind/oapi/bli_l3_3m4m_oapi.c b/frame/ind/oapi/bli_l3_3m4m_oapi.c index 04f2259d2..40348e627 100644 --- a/frame/ind/oapi/bli_l3_3m4m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m_oapi.c @@ -34,12 +34,6 @@ #include "blis.h" -// Bring control trees into scope. 
-extern gemm_t* gemm_cntl; -extern trsm_t* trsm_l_cntl; -extern trsm_t* trsm_r_cntl; - - // -- gemm/her2k/syr2k --------------------------------------------------------- #undef GENFRONT @@ -81,10 +75,9 @@ void PASTEMAC(opname,imeth) \ stage. */ \ if ( i > 0 ) beta_use = &BLIS_ONE; \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, b, beta_use, c, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( alpha, a, b, beta_use, c, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -161,10 +154,9 @@ void PASTEMAC(opname,imeth) \ stage. */ \ if ( i > 0 ) beta_use = &BLIS_ONE; \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, beta_use, c, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( side, alpha, a, b, beta_use, c, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -239,10 +231,9 @@ void PASTEMAC(opname,imeth) \ stage. */ \ if ( i > 0 ) beta_use = &BLIS_ONE; \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, beta_use, c, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( alpha, a, beta_use, c, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -302,10 +293,9 @@ void PASTEMAC(opname,imeth) \ /* Prepare the context for the ith stage of computation. */ \ PASTEMAC2(cname,imeth,_cntx_stage)( i, cntx_p ); \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. 
*/ \ - PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -353,11 +343,9 @@ void PASTEMAC(opname,imeth) \ /* NOTE: trsm cannot be implemented via any induced method that needs to execute in stages (e.g. 3mh, 4mh). */ \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, \ - PASTECH(cname,_l_cntl), \ - PASTECH(cname,_r_cntl) ); \ + PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -373,10 +361,3 @@ GENFRONT( trsm, trsm, 3m1, 1 ) //GENFRONT( trmm, trsm, 4mb, 1 ) // Unimplementable. GENFRONT( trsm, trsm, 4m1, 1 ) - -// -// ----------------------------------------------------------------------------- -// ----------------------------------------------------------------------------- -// ----------------------------------------------------------------------------- -// - diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 9038067c5..68b664d65 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -34,11 +34,6 @@ #include "blis.h" -// Bring control trees into scope. -extern gemm_t* gemm_cntl; -extern trsm_t* trsm_l_cntl; -extern trsm_t* trsm_r_cntl; - // NOTE: The function definitions in this file can be consolidated with the // definitions for the other induced methods. The only advantage of keeping // them separate is that it allows us to avoid the very small loop overhead @@ -69,8 +64,7 @@ void PASTEMAC(opname,imeth) \ tree. 
*/ \ PASTEMAC(opname,_front) \ ( \ - alpha, a, b, beta, c, cntx_p, \ - PASTECH(cname,_cntl) \ + alpha, a, b, beta, c, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -107,8 +101,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - side, alpha, a, b, beta, c, cntx_p, \ - PASTECH(cname,_cntl) \ + side, alpha, a, b, beta, c, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -143,8 +136,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - alpha, a, beta, c, cntx_p, \ - PASTECH(cname,_cntl) \ + alpha, a, beta, c, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -178,8 +170,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - side, alpha, a, b, cntx_p, \ - PASTECH(cname,_cntl) \ + side, alpha, a, b, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -212,9 +203,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - side, alpha, a, b, cntx_p, \ - PASTECH(cname,_l_cntl), \ - PASTECH(cname,_r_cntl) \ + side, alpha, a, b, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. 
*/ \ diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 04f0c34a8..7c1fe69f9 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -201,21 +201,27 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) void bli_l3_thread_decorator ( - dim_t n_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread + dim_t n_threads, + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ) { _Pragma( "omp parallel num_threads(n_threads)" ) { - dim_t omp_id = omp_get_thread_num(); + dim_t omp_id = omp_get_thread_num(); + thrinfo_t* thread_i = thread[omp_id]; + + cntl_t* cntl_use; + + // Create a default control tree for the operation, if needed. + bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); func ( @@ -225,9 +231,12 @@ void bli_l3_thread_decorator beta, c, cntx, - cntl, + cntl_use, thread[omp_id] ); + + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); } } diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 42a9c6979..0f2707d91 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -80,7 +80,7 @@ void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) communicator->n_threads = n_threads; communicator->sense = 0; communicator->threads_arrived = 0; - + #ifdef BLIS_USE_PTHREAD_MUTEX pthread_mutex_init( &communicator->mutex, NULL ); #endif @@ -123,52 +123,71 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) #endif -void* thread_decorator_helper( void* data_void ); +void* bli_l3_thread_entry( void* data_void ); +// A data structure to assist in passing operands to additional threads. 
typedef struct thread_data { - l3_int_t func; - obj_t* alpha; - obj_t* a; - obj_t* b; - obj_t* beta; - obj_t* c; - void* cntx; - void* cntl; - void* thread; + l3int_t func; + obj_t* alpha; + obj_t* a; + obj_t* b; + obj_t* beta; + obj_t* c; + cntx_t* cntx; + cntl_t* cntl; + thrinfo_t* thread; } thread_data_t; -void* thread_decorator_helper( void* data_void ) +// Entry point for additional threads: unpacks the thread_data_t, runs func, and returns NULL. +void* bli_l3_thread_entry( void* data_void ) { - thread_data_t* data = data_void; + thread_data_t* data = data_void; + + obj_t* alpha = data->alpha; + obj_t* a = data->a; + obj_t* b = data->b; + obj_t* beta = data->beta; + obj_t* c = data->c; + cntx_t* cntx = data->cntx; + cntl_t* cntl = data->cntl; + thrinfo_t* thread_i = data->thread; + + cntl_t* cntl_use; + + // Create a default control tree for the operation, if needed. + bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); data->func ( - data->alpha, - data->a, - data->b, - data->beta, - data->c, - data->cntx, - data->cntl, - data->thread + alpha, + a, + b, + beta, + c, + cntx, + cntl_use, + thread_i ); + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); + return NULL; } void bli_l3_thread_decorator ( - dim_t n_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread + dim_t n_threads, + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ) { pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads ); @@ -176,22 +195,38 @@ void bli_l3_thread_decorator for ( int i = 1; i < n_threads; i++ ) { - //Setup the thread data - datas[i].func = func; - datas[i].alpha = alpha; - datas[i].a = a; - datas[i].b = b; - datas[i].beta = beta; - datas[i].c = c; - datas[i].cntx = cntx; - datas[i].cntl = cntl; + // Set up thread data for additional threads (beyond thread 0).
+ datas[i].func = func; + datas[i].alpha = alpha; + datas[i].a = a; + datas[i].b = b; + datas[i].beta = beta; + datas[i].c = c; + datas[i].cntx = cntx; + datas[i].cntl = cntl; datas[i].thread = thread[i]; - pthread_create( &pthreads[i], NULL, &thread_decorator_helper, &datas[i] ); + // Spawn additional threads. + pthread_create( &pthreads[i], NULL, &bli_l3_thread_entry, &datas[i] ); } - func( alpha, a, b, beta, c, cntx, cntl, thread[0] ); + // The main thread executes this. + { + cntl_t* cntl_use; + + // Create a default control tree for the operation, if needed. + bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + + // Thread 0 executes func with cntl_use (not the raw cntl), just as the spawned threads do. + func( alpha, a, b, beta, c, cntx, cntl_use, thread[0] ); + + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread[0] ); + } + + + // Thread 0 waits for additional threads to finish. for ( int i = 1; i < n_threads; i++) { pthread_join( pthreads[i], NULL ); diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index fb2bc97bb..99de67220 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -36,24 +36,6 @@ #ifndef BLIS_ENABLE_MULTITHREADING -void bli_l3_thread_decorator - ( - dim_t n_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread - ) -{ - func( alpha, a, b, beta, c, cntx, cntl, thread[0] ); -} - - //Constructors and destructors for constructors thrcomm_t* bli_thrcomm_create( dim_t n_threads ) { @@ -89,5 +71,43 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) return; } +void bli_l3_thread_decorator + ( + dim_t n_threads, + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread + ) +{ + thrinfo_t* thread_i = thread[0]; + + cntl_t* cntl_use; + + // Create a default control tree for the operation, if needed.
+ bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + cntl_use, + thread[0] + ); + + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); +} + + #endif diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 1cbd6eefe..43f0eaf8b 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -38,7 +38,6 @@ static bool_t bli_thread_is_init = FALSE; thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; -thrinfo_t BLIS_HERK_SINGLE_THREADED = {}; thrcomm_t BLIS_SINGLE_COMM = {}; // ----------------------------------------------------------------------------- @@ -51,7 +50,6 @@ void bli_thread_init( void ) bli_thrcomm_init( &BLIS_SINGLE_COMM, 1 ); bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); - bli_l3_thrinfo_init_single( &BLIS_HERK_SINGLE_THREADED ); // Mark API as initialized. 
bli_thread_is_init = TRUE; @@ -211,38 +209,6 @@ void bli_thread_get_range_sub } } -siz_t bli_thread_get_range_mdim - ( - dir_t direct, - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - if ( direct == BLIS_FWD ) - return bli_thread_get_range_t2b( thr, a, bmult, start, end ); - else - return bli_thread_get_range_b2t( thr, a, bmult, start, end ); -} - -siz_t bli_thread_get_range_ndim - ( - dir_t direct, - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - if ( direct == BLIS_FWD ) - return bli_thread_get_range_l2r( thr, a, bmult, start, end ); - else - return bli_thread_get_range_r2l( thr, a, bmult, start, end ); -} - siz_t bli_thread_get_range_l2r ( thrinfo_t* thr, @@ -669,36 +635,122 @@ siz_t bli_thread_get_range_weighted_sub return area; } -siz_t bli_thread_get_range_weighted_mdim +siz_t bli_thread_get_range_mdim ( dir_t direct, thrinfo_t* thr, obj_t* a, - blksz_t* bmult, + obj_t* b, + obj_t* c, + cntl_t* cntl, + cntx_t* cntx, dim_t* start, dim_t* end ) { - if ( direct == BLIS_FWD ) - return bli_thread_get_range_t2b( thr, a, bmult, start, end ); + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntx_get_family( cntx ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. + if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( *a ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + obj_t* x; + bool_t use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. 
+ // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). + if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } + else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_weighted_t2b( thr, x, bmult, start, end ); + else + return bli_thread_get_range_weighted_b2t( thr, x, bmult, start, end ); + } else - return bli_thread_get_range_b2t( thr, a, bmult, start, end ); + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_t2b( thr, x, bmult, start, end ); + else + return bli_thread_get_range_b2t( thr, x, bmult, start, end ); + } } -siz_t bli_thread_get_range_weighted_ndim +siz_t bli_thread_get_range_ndim ( dir_t direct, thrinfo_t* thr, obj_t* a, - blksz_t* bmult, + obj_t* b, + obj_t* c, + cntl_t* cntl, + cntx_t* cntx, dim_t* start, dim_t* end ) { - if ( direct == BLIS_FWD ) - return bli_thread_get_range_l2r( thr, a, bmult, start, end ); + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntx_get_family( cntx ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. 
+ if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( *b ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + obj_t* x; + bool_t use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. + // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). + if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } + else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end ); + else + return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end ); + } else - return bli_thread_get_range_r2l( thr, a, bmult, start, end ); + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_l2r( thr, x, bmult, start, end ); + else + return bli_thread_get_range_r2l( thr, x, bmult, start, end ); + } } siz_t bli_thread_get_range_weighted_l2r diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index eb7e615ec..10097c39e 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -90,15 +90,16 @@ siz_t PASTEMAC0( opname ) \ dir_t direct, \ thrinfo_t* thr, \ obj_t* a, \ - blksz_t* bmult, \ + obj_t* b, \ + obj_t* c, \ + cntl_t* cntl, \ + cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_get_range_mdim ) GENPROT( thread_get_range_ndim ) -GENPROT( thread_get_range_weighted_mdim ) -GENPROT( 
thread_get_range_weighted_ndim ) #undef GENPROT #define GENPROT( opname ) \ @@ -157,31 +158,31 @@ siz_t bli_thread_get_range_weighted_sub // Level-3 internal function type -typedef void (*l3_int_t) +typedef void (*l3int_t) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void* thread + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread ); // Level-3 thread decorator prototype void bli_l3_thread_decorator ( - dim_t num_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread + dim_t n_threads, + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ); // Miscellaneous prototypes diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index e47006954..4cf55b3d4 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -42,9 +42,8 @@ thrinfo_t* bli_thrinfo_create dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comms, + thrinfo_t* sub_node ) { thrinfo_t* thread = bli_malloc_intl( sizeof( thrinfo_t ) ); @@ -55,9 +54,8 @@ thrinfo_t* bli_thrinfo_create ocomm, ocomm_id, icomm, icomm_id, n_way, work_id, - opackm, - ipackm, - sub_self + free_comms, + sub_node ); return thread; @@ -72,21 +70,19 @@ void bli_thrinfo_init dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comms, + thrinfo_t* sub_node ) { - thread->ocomm = ocomm; - thread->ocomm_id = ocomm_id; - thread->icomm = icomm; - thread->icomm_id = icomm_id; - thread->n_way = n_way; - thread->work_id = work_id; + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + 
thread->free_comms = free_comms; - thread->opackm = opackm; - thread->ipackm = ipackm; - thread->sub_self = sub_self; + thread->sub_node = sub_node; } void bli_thrinfo_init_single @@ -101,37 +97,8 @@ void bli_thrinfo_init_single &BLIS_SINGLE_COMM, 0, 1, 0, - &BLIS_PACKM_SINGLE_THREADED, - &BLIS_PACKM_SINGLE_THREADED, + FALSE, thread ); } -#if 0 -void bli_thrinfo_free - ( - thrinfo_t* thread - ) -{ - if ( thread == NULL || - thread == &BLIS_GEMM_SINGLE_THREADED || - thread == &BLIS_HERK_SINGLE_THREADED || - thread == &BLIS_PACKM_SINGLE_THREADED - ) return; - - // Free Communicators - if ( bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( thread->ocomm ); - if ( bli_thrinfo_sub_self( thread ) == NULL && bli_thread_am_ichief( thread ) ) - bli_thrcomm_free( thread->icomm ); - - // Free thrinfo chidren - bli_packm_thrinfo_free( thread->opackm ); - bli_packm_thrinfo_free( thread->ipackm ); - bli_l3_thrinfo_free( thread->sub_self ); - bli_free_intl( thread ); - - return; -} -#endif - diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 3f8a3112b..5ebb6609f 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -58,9 +58,13 @@ struct thrinfo_s // What we're working on. dim_t work_id; - struct thrinfo_s* opackm; - struct thrinfo_s* ipackm; - struct thrinfo_s* sub_self; + // When freeing, should the communicators in this node be freed? Usually, + // this field is true, but when nodes are created that share the same + // communicators as other nodes (such as with packm nodes), this is set + // to false. + bool_t free_comms; + + struct thrinfo_s* sub_node; }; typedef struct thrinfo_s thrinfo_t; @@ -81,9 +85,11 @@ typedef struct thrinfo_s thrinfo_t; // Generic accessor macros for all thrinfo_t objects. 
// -#define bli_thrinfo_sub_opackm( t ) ( t->opackm ) -#define bli_thrinfo_sub_ipackm( t ) ( t->ipackm ) -#define bli_thrinfo_sub_self( t ) ( t->sub_self ) +#define bli_thrinfo_ocomm( t ) ( t->ocomm ) +#define bli_thrinfo_icomm( t ) ( t->icomm ) +#define bli_thrinfo_needs_free_comms( t ) ( t->free_comms ) + +#define bli_thrinfo_sub_node( t ) ( t->sub_node ) // // Prototypes for level-3 thrinfo functions not specific to any operation. @@ -97,9 +103,8 @@ thrinfo_t* bli_thrinfo_create dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comms, + thrinfo_t* sub_node ); void bli_thrinfo_init @@ -111,9 +116,8 @@ void bli_thrinfo_init dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comms, + thrinfo_t* sub_node ); void bli_thrinfo_init_single diff --git a/testsuite/input.general b/testsuite/input.general index b9940dac3..0bf9053bd 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -36,6 +36,7 @@ sdcz # Datatype(s) to test: 1 # 4mh ('1' = enable; '0' = disable) 1 # 4m1b ('1' = enable; '0' = disable) 1 # 4m1a ('1' = enable; '0' = disable) +1 # native ('1' = enable; '0' = disable) 1 # Error-checking level: # '0' = disable error checking; '1' = full error checking i # Reaction to test failure: diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 0bb3c4440..514fdf66a 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -220,30 +220,28 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - // Initialize pack objects. - bli_obj_init_pack( &ap ); - bli_obj_init_pack( &bp ); - - // Create pack objects for a and b. 
- libblis_test_pobj_create( BLIS_MR, - BLIS_KR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - &a, &ap, - &cntx ); - libblis_test_pobj_create( BLIS_KR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx ); - - // Pack the contents of a and b to ap and bp, respectively. - bli_packm_blk_var1( &a, &ap, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - + // Create pack objects for a and b, and pack them to ap and bp, + // respectively. + cntl_t* cntl_a = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_KR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + &a, &ap, + &cntx + ); + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_KR, + BLIS_NR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + &cntx + ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) @@ -269,9 +267,10 @@ void libblis_test_gemm_ukr_experiment // Zero out performance and residual if output matrix is empty. libblis_test_check_empty_problem( &c, perf, resid ); - // Release packing buffers within pack objects. - bli_obj_release_pack( &ap ); - bli_obj_release_pack( &bp ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index c74d47d60..afd436d7f 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -260,39 +260,34 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); - - // Initialize pack objects. 
- bli_obj_init_pack( &ap ); - bli_obj_init_pack( &bp ); - - // Create pack objects for a and b. - libblis_test_pobj_create( BLIS_MR, - BLIS_MR, - BLIS_INVERT_DIAG, - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - &a, &ap, - &cntx ); - libblis_test_pobj_create( BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx ); - - // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - - // Pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); + // Create pack objects for a and b, and pack them to ap and bp, + // respectively. + cntl_t* cntl_a = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_MR, + BLIS_INVERT_DIAG, + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + &a, &ap, + &cntx + ); + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_NR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + &cntx + ); // Set the uplo field of ap since the default for packed objects is // BLIS_DENSE, and the _make_subparts() routine needs this information // to know how to initialize the subpartitions. bli_obj_set_uplo( uploa, ap ); - // Create subpartitions from the a and b panels. bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, &a1xp, &a11p, &bx1p, &b11p ); @@ -302,14 +297,13 @@ void libblis_test_gemmtrsm_ukr_experiment // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, a11p ); - // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c11_save, &c11 ); // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); time = bli_clock(); @@ -331,9 +325,10 @@ void libblis_test_gemmtrsm_ukr_experiment // Zero out performance and residual if output matrix is empty. 
//libblis_test_check_empty_problem( &c11, perf, resid ); - // Release packing buffers within pack objects. - bli_obj_release_pack( &ap ); - bli_obj_release_pack( &bp ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a_big ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index b86772361..e22bb52df 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -425,7 +425,9 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_4M1A ]) ); - params->ind_enable[ BLIS_NAT ] = 1; + // Read whether to test native (complex) execution. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_NAT ]) ); // Read the requested error-checking level. libblis_test_read_next_line( buffer, input_stream ); @@ -943,7 +945,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "problem size: first to test %u\n", params->p_first ); libblis_test_fprintf_c( os, "problem size: max to test %u\n", params->p_max ); libblis_test_fprintf_c( os, "problem size increment %u\n", params->p_inc ); - libblis_test_fprintf_c( os, "test induced complex \n" ); + libblis_test_fprintf_c( os, "complex implementations \n" ); libblis_test_fprintf_c( os, " 3mh? %u\n", params->ind_enable[ BLIS_3MH ] ); libblis_test_fprintf_c( os, " 3m3? %u\n", params->ind_enable[ BLIS_3M3 ] ); libblis_test_fprintf_c( os, " 3m2? %u\n", params->ind_enable[ BLIS_3M2 ] ); @@ -951,7 +953,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, " 4mh? 
%u\n", params->ind_enable[ BLIS_4MH ] ); libblis_test_fprintf_c( os, " 4m1b (4mb)? %u\n", params->ind_enable[ BLIS_4M1B ] ); libblis_test_fprintf_c( os, " 4m1a (4m1)? %u\n", params->ind_enable[ BLIS_4M1A ] ); - libblis_test_fprintf_c( os, "test native complex? %u\n", params->ind_enable[ BLIS_NAT ] ); + libblis_test_fprintf_c( os, " native? %u\n", params->ind_enable[ BLIS_NAT ] ); libblis_test_fprintf_c( os, "error-checking level %u\n", params->error_checking_level ); libblis_test_fprintf_c( os, "reaction to failure %c\n", params->reaction_to_failure ); libblis_test_fprintf_c( os, "output in matlab format? %u\n", params->output_matlab_format ); @@ -1503,12 +1505,12 @@ void libblis_test_op_driver( test_params_t* params, // Loop over induced methods (or just BLIS_NAT). for ( indi = ind_first; indi <= ind_last; ++indi ) { - // If the current induced method is native execution, OR - // if the current induced method is implemented (for the - // operation being tested) AND it was requested, then we - // enable ONLY that method and proceed. Otherwise, we - // skip the current method and go to the next method. - if ( indi == BLIS_NAT ) { ; } + // If the current datatype is real, OR if the current + // induced method is implemented (for the operation + // being tested) AND it was requested, then we enable + // ONLY that method and proceed. Otherwise, we skip the + // current method and go to the next method. 
+ if ( bli_is_real( datatype ) ) { ; } else if ( bli_ind_oper_is_impl( op->opid, indi ) && params->ind_enable[ indi ] == 1 ) { ; } else { continue; } @@ -1875,22 +1877,34 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c -void libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) { - // Start with making p and alias to a. - bli_obj_alias_to( *a, *p ); + bool_t does_inv_diag; - // Then initialize p appropriately for packing. - bli_packm_init_pack( inv_diag, - pack_schema, - BLIS_PACK_FWD_IF_UPPER, - BLIS_PACK_FWD_IF_LOWER, - pack_buf, - bmult_id_m, - bmult_id_n, - a, - p, - cntx ); + if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE; + else does_inv_diag = TRUE; + + // Create a control tree node for the packing operation. + cntl_t* cntl = bli_packm_cntl_obj_create + ( + NULL, // func ptr is not referenced b/c we don't call via l3 _int(). + bli_packm_blk_var1, + bmult_id_m, + bmult_id_n, + does_inv_diag, + FALSE, + FALSE, + pack_schema, + pack_buf, + NULL // no child node needed + ); + + // Pack the contents of A to P. + bli_l3_packm( a, p, cntx, cntl, &BLIS_PACKM_SINGLE_THREADED ); + + // Return the control tree pointer so the caller can free the cntl_t and its + // mem_t entry later on. 
+ return cntl; } diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index fab7c1a05..07ffcd106 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -382,7 +382,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces ); // --- Create object --- void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a ); -void libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ); // --- Randomize/initialize object --- diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 200a6d1a8..bf5f2d6bd 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -221,40 +221,39 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - // Initialize pack objects. - bli_obj_init_pack( &ap ); - bli_obj_init_pack( &bp ); - - // Create pack objects for a and b. - libblis_test_pobj_create( BLIS_MR, - BLIS_MR, - BLIS_INVERT_DIAG, - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - &a, &ap, - &cntx ); - libblis_test_pobj_create( BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx ); + // Create pack objects for a and b, and pack them to ap and bp, + // respectively. 
+ cntl_t* cntl_a = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_MR, + BLIS_INVERT_DIAG, + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + &a, &ap, + &cntx + ); + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_NR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + &cntx + ); // Set the uplo field of ap since the default for packed objects is // BLIS_DENSE, and the _ukernel() wrapper needs this information to // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, ap ); - // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - - // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); bli_copym( &c_save, &c ); @@ -277,9 +276,10 @@ void libblis_test_trsm_ukr_experiment // Zero out performance and residual if output matrix is empty. libblis_test_check_empty_problem( &c, perf, resid ); - // Release packing buffers within pack objects. - bli_obj_release_pack( &ap ); - bli_obj_release_pack( &bp ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a ); From abd61f9fa75d77a96d1491b3e035451ee73238fe Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 30 Aug 2016 12:34:19 -0500 Subject: [PATCH 07/27] Updated BLIS4 TOMS citation in README.md. 
--- README.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7142a1329..9fd5c6146 100644 --- a/README.md +++ b/README.md @@ -262,11 +262,17 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an ``` @article{BLIS4, - author = {Tze Meng Low and Francisco D. Igual and Tyler M. Smith and Enrique S. Quintana-Ort\'{\i}}, - title = {Analytical Models for the {BLIS} Framework}, + author = {Tze Meng Low and Francisco D. Igual and Tyler M. Smith and + Enrique S. Quintana-Ort\'{\i}}, + title = {Analytical Modeling Is Enough for High-Performance {BLIS}}, journal = {ACM Transactions on Mathematical Software}, - year = 2016, - note = {Accepted}, + volume = {43}, + number = {2}, + pages = {12:1--12:18}, + month = aug, + year = {2016}, + issue_date = {August 2016}, + url = {http://doi.acm.org/10.1145/2925987}, } ``` From 35509818cbea1598b123421f81c42120889a03c3 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 31 Aug 2016 17:34:15 -0500 Subject: [PATCH 08/27] Added, moved some thread barriers. Details: - Removed thread barriers from the end of the loop bodies of bli_gemm_blk_var1(), bli_gemm_blk_var2(), bli_trsm_blk_var1(), and bli_trsm_blk_var2(). - Moved the thread barrier at the end of bli_packm_int() to the end of bli_l3_packm(), and added missing barriers to that function. - Removed the no longer necessary (and now incorrect) ochief guard in bli_gemm3m3_packa() on the bli_obj_scalar_reset() on C. - Thanks to Tyler Smith for help with these changes. 
--- frame/1m/packm/bli_packm_int.c | 3 --- frame/3/bli_l3_packm.c | 8 ++++++++ frame/3/gemm/bli_gemm_blk_var1.c | 2 -- frame/3/gemm/bli_gemm_blk_var2.c | 2 -- frame/3/gemm/ind/bli_gemm3m3_packa.c | 2 +- frame/3/trsm/bli_trsm_blk_var1.c | 2 -- frame/3/trsm/bli_trsm_blk_var2.c | 2 -- 7 files changed, 9 insertions(+), 12 deletions(-) diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index d36919c33..22ce70a44 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -100,8 +100,5 @@ void bli_packm_int cntl, thread ); - - // Barrier so that packing is done before computation. - bli_thread_obarrier( thread ); } diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c index 6714022db..28fb1f857 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/3/bli_l3_packm.c @@ -48,6 +48,9 @@ void bli_l3_packm mem_t* cntl_mem_p; siz_t size_needed; + // FGVZ: Not sure why we need this barrier, but we do. + bli_thread_obarrier( thread ); + // Every thread initializes x_pack and determines the size of memory // block needed (which gets embedded into the otherwise "blank" mem_t // entry in the control tree node). @@ -148,6 +151,8 @@ void bli_l3_packm // then we use it as-is. No action is needed, because all threads // will already have the cached values in their local control // trees' mem_t entries, currently pointed to by cntl_mem_p. + + bli_thread_obarrier( thread ); } } @@ -167,5 +172,8 @@ void bli_l3_packm cntl, thread ); + + // Barrier so that packing is done before computation. 
+ bli_thread_obarrier( thread ); } diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 817e48cee..1a5693d8c 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -90,8 +90,6 @@ void bli_gemm_blk_var1 bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); - - bli_thread_ibarrier( thread ); } } diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index 0fceae6e6..a65f8a20a 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -90,8 +90,6 @@ void bli_gemm_blk_var2 bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); - - bli_thread_ibarrier( thread ); } } diff --git a/frame/3/gemm/ind/bli_gemm3m3_packa.c b/frame/3/gemm/ind/bli_gemm3m3_packa.c index f6e92020c..516047213 100644 --- a/frame/3/gemm/ind/bli_gemm3m3_packa.c +++ b/frame/3/gemm/ind/bli_gemm3m3_packa.c @@ -80,7 +80,7 @@ void bli_gemm3m3_packa ); // Only apply beta within the first of three subproblems. - if ( bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c ); + bli_obj_scalar_reset( c ); // ----------------------------------------------------- diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 1634efa0c..a731d8265 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -90,8 +90,6 @@ void bli_trsm_blk_var1 bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); - - bli_thread_ibarrier( thread ); } } diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index c2ca6b3ed..a133f0bb0 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -90,8 +90,6 @@ void bli_trsm_blk_var2 bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); - - bli_thread_ibarrier( thread ); } } From 121c39d455f2db6f7ce6802ba7f73ad5e088c68c Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Mon, 5 Sep 2016 13:11:42 -0500 Subject: [PATCH 09/27] Added complex gemm micro-kernels for haswell. Details: - Defined cgemm (3x8) and zgemm (3x4) micro-kernels for haswell-based architectures. As with their real domain brethren, these kernels prefer row storage, (though this doesn't affect most users due to high-level optimizations in most level-3 operations that induce a transpose to whatever storage preference the kernel may have). --- config/haswell/bli_kernel.h | 50 +- kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 1028 +++++++++++++++++- 2 files changed, 1056 insertions(+), 22 deletions(-) diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 24a3c68b1..d23a00a5d 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -49,26 +49,27 @@ // (b) MR (for zero-padding purposes when MR and NR are "swapped") // -#if 0 +// sgemm micro-kernel +#if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_24x4 #define BLIS_DEFAULT_MC_S 264 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 24 #define BLIS_DEFAULT_NR_S 4 +#endif -#else - -/* +#if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 -*/ +#endif +#if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 @@ -77,29 +78,29 @@ #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS - #endif -#if 0 +// dgemm micro-kernel +#if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_12x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 192 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 12 #define BLIS_DEFAULT_NR_D 4 +#endif -#else - -/* +#if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define 
BLIS_DEFAULT_NR_D 6 -*/ +#endif +#if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 @@ -108,10 +109,33 @@ #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS - - #endif +// cgemm micro-kernel + +#if 1 +#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 +#define BLIS_DEFAULT_MC_C 144 +#define BLIS_DEFAULT_KC_C 256 +#define BLIS_DEFAULT_NC_C 4080 +#define BLIS_DEFAULT_MR_C 3 +#define BLIS_DEFAULT_NR_C 8 + +#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif + +// zgemm micro-kernel + +#if 1 +#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 +#define BLIS_DEFAULT_MC_Z 72 +#define BLIS_DEFAULT_KC_Z 256 +#define BLIS_DEFAULT_NC_Z 4080 +#define BLIS_DEFAULT_MR_Z 3 +#define BLIS_DEFAULT_NR_Z 4 + +#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index cb6097fe2..bee1df996 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -249,7 +249,7 @@ void bli_sgemm_asm_6x16 " \n\t" ".SLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" + "prefetcht0 64 * 4(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" @@ -622,6 +622,8 @@ void bli_sgemm_asm_6x16 } + + #define DGEMM_INPUT_GS_BETA_NZ \ "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ @@ -1197,9 +1199,41 @@ void bli_dgemm_asm_6x8 ); } -#if 0 -void bli_cgemm_asm_ + + +// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
+// outputs to ymm0 +#define CGEMM_INPUT_SCALE_GS_BETA_NZ \ + "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ + "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ + "vmovlpd (%%rcx,%%rsi,2), %%xmm3, %%xmm3 \n\t" \ + "vmovhpd (%%rcx,%%r13 ), %%xmm3, %%xmm3 \n\t" \ + "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ + "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + +// assumes values to output are in ymm0 +#define CGEMM_OUTPUT_GS \ + "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ + "vmovlpd %%xmm0, (%%rcx ) \n\t" \ + "vmovhpd %%xmm0, (%%rcx,%%rsi,1) \n\t" \ + "vmovlpd %%xmm3, (%%rcx,%%rsi,2) \n\t" \ + "vmovhpd %%xmm3, (%%rcx,%%r13 ) \n\t" + +#define CGEMM_INPUT_SCALE_RS_BETA_NZ \ + "vmovups (%%rcx), %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ + "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + +#define CGEMM_OUTPUT_RS \ + "vmovups %%ymm0, (%%rcx) \n\t" \ + +void bli_cgemm_asm_3x8 ( dim_t k, scomplex* restrict alpha, @@ -1214,14 +1248,515 @@ void bli_cgemm_asm_ //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - //dim_t k_iter = k / 4; - //dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. 
+ " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(scomplex) + " \n\t" + "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c; + "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c; + " \n\t" + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".CLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 32 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 7 * 
4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 38 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + 
"vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $4 * 3 * 8, %%rax \n\t" // a += 4*3 (unroll x mr) + "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .CLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".CCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. 
+ " \n\t" + " \n\t" + ".CLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 32 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $1 * 3 * 8, %%rax \n\t" // a += 1*3 (unroll x mr) + "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .CLOOPKLEFT \n\t" // iterate again if i != 0. 
+ " \n\t" + " \n\t" + " \n\t" + ".CPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" // permute even and odd elements + " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 + "vpermilps $0xb1, %%ymm6, %%ymm6 \n\t" + "vpermilps $0xb1, %%ymm7, %%ymm7 \n\t" + "vpermilps $0xb1, %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1, %%ymm11, %%ymm11 \n\t" + "vpermilps $0xb1, %%ymm14, %%ymm14 \n\t" + "vpermilps $0xb1, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" // subtract/add even/odd elements + "vaddsubps %%ymm6, %%ymm4, %%ymm4 \n\t" + "vaddsubps %%ymm7, %%ymm5, %%ymm5 \n\t" + " \n\t" + "vaddsubps %%ymm10, %%ymm8, %%ymm8 \n\t" + "vaddsubps %%ymm11, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vaddsubps %%ymm14, %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm15, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate + "vbroadcastss 4(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate + " \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm4, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm4, %%ymm4 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm5, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm5, %%ymm5 \n\t" + " \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm8, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm9, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm9, %%ymm9 \n\t" + " \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm12, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm13, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 
\n\t" + "vaddsubps %%ymm3, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastss (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate + "vbroadcastss 4(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(scomplex) + "leaq (,%%rsi,4), %%rdx \n\t" // rdx = 4*cs_c; + "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomiss %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. + "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); + "vucomiss %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. + "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); + "andb %%r8b, %%r9b \n\t" // clear ZF if r8b & r9b == 1. + "jne .CBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case + " \n\t" + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
+ "jz .CROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".CGENSTORED: \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".CROWSTORED: \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".CBETAZERO: \n\t" + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .CROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".CGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm8, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm9, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end.
+ " \n\t" + " \n\t" + " \n\t" + ".CROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" + "vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm8, (%%r11) \n\t" + "vmovups %%ymm9, (%%r11,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm12, (%%r12) \n\t" + "vmovups %%ymm13, (%%r12,%%rdx,1) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".CDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } -void bli_zgemm_asm_ + +// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
+// outputs to ymm0 +#define ZGEMM_INPUT_SCALE_GS_BETA_NZ \ + "vmovupd (%%rcx), %%xmm0 \n\t" \ + "vmovupd (%%rcx,%%rsi), %%xmm3 \n\t" \ + "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + +// assumes values to output are in ymm0 +#define ZGEMM_OUTPUT_GS \ + "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ + "vmovupd %%xmm0, (%%rcx) \n\t" \ + "vmovupd %%xmm3, (%%rcx,%%rsi ) \n\t" \ + +#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \ + "vmovups (%%rcx), %%ymm0 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + +#define ZGEMM_OUTPUT_RS \ + "vmovupd %%ymm0, (%%rcx) \n\t" \ + +void bli_zgemm_asm_3x4 ( dim_t k, dcomplex* restrict alpha, @@ -1236,9 +1771,484 @@ void bli_zgemm_asm_ //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - //dim_t k_iter = k / 4; - //dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + //uint64_t alpha_is_unit = bli_zeq1( *alpha ); + + + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. 
+ " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(dcomplex) + "leaq (,%%rdi,2), %%rdi \n\t" + " \n\t" + "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c; + "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c; + " \n\t" + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".ZLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 32 * 16(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastsd 6 * 8(%%rax), 
%%ymm2 \n\t" + "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 38 * 16(%%rax) \n\t" + " \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd 
%%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $4 * 3 * 16, %%rax \n\t" // a += 4*3 (unroll x mr) + "addq $4 * 4 * 16, %%rbx \n\t" // b += 4*4 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .ZLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".ZCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. 
+ " \n\t" + " \n\t" + ".ZLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 32 * 16(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $1 * 3 * 16, %%rax \n\t" // a += 1*3 (unroll x mr) + "addq $1 * 4 * 16, %%rbx \n\t" // b += 1*4 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .ZLOOPKLEFT \n\t" // iterate again if i != 0. 
+ " \n\t" + " \n\t" + " \n\t" + ".ZPOSTACCUM: \n\t" + " \n\t" + " \n\t" // permute even and odd elements + " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 + "vpermilpd $0x5, %%ymm6, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm7, %%ymm7 \n\t" + "vpermilpd $0x5, %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5, %%ymm11, %%ymm11 \n\t" + "vpermilpd $0x5, %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x5, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" // subtract/add even/odd elements + "vaddsubpd %%ymm6, %%ymm4, %%ymm4 \n\t" + "vaddsubpd %%ymm7, %%ymm5, %%ymm5 \n\t" + " \n\t" + "vaddsubpd %%ymm10, %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vaddsubpd %%ymm14, %%ymm12, %%ymm12 \n\t" + "vaddsubpd %%ymm15, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate + "vbroadcastsd 8(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate + " \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm4, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm4, %%ymm4 \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm5, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm5, %%ymm5 \n\t" + " \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm8, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm9, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm9, %%ymm9 \n\t" + " \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm12, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm13, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd 
%%ymm3, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastsd (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate + "vbroadcastsd 8(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(dcomplex) + "leaq (,%%rsi,2), %%rsi \n\t" + "leaq (,%%rsi,2), %%rdx \n\t" // rdx = 2*cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomisd %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. + "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); + "vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. + "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); + "andb %%r8b, %%r9b \n\t" // clear ZF if r8b & r9b == 1. + "jne .ZBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case + " \n\t" + " \n\t" + "cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16.
+ "jz .ZROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".ZGENSTORED: \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .ZDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".ZROWSTORED: \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + " \n\t" + " \n\t" + " \n\t" + "jmp .ZDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".ZBETAZERO: \n\t" + " \n\t" + "cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16. + "jz .ZROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".ZGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm8, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm9, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .ZDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".ZROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" + "vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm8, (%%r11) \n\t" + "vmovups %%ymm9, (%%r11,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm12, (%%r12) \n\t" + "vmovups %%ymm13, (%%r12,%%rdx,1) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".ZDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } -#endif From c0630c4024b08750043a2942a3e8a037aa6b6259 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 12 Sep 2016 13:59:02 -0500 Subject: [PATCH 10/27] Added debugging printf()'s to bli_l3_thrinfo.c. Details: - Added optional printf() statements to print out thread communicator info as the thrinfo_t structure is built in bli_l3_thrinfo.c. - Minor changes to frame/thread/bli_thrinfo.h. 
--- frame/3/bli_l3_thrinfo.c | 105 +++++++++---------------------------- frame/thread/bli_thrinfo.h | 41 ++++++++------- 2 files changed, 49 insertions(+), 97 deletions(-) diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 2505d37a4..36b65b52b 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -124,6 +124,8 @@ void bli_l3_thrinfo_free // ----------------------------------------------------------------------------- +//#define PRINT_THRINFO + thrinfo_t** bli_l3_thrinfo_create_paths ( opid_t l3_op, @@ -211,6 +213,16 @@ thrinfo_t** bli_l3_thrinfo_create_paths dim_t jr_nt = ir_way; dim_t ir_nt = 1; +#ifdef PRINT_THRINFO +printf( " jc kc ic jr ir\n" ); +printf( "xx_way: %4lu %4lu %4lu %4lu %4lu\n", + jc_way, kc_way, ic_way, jr_way, ir_way ); +printf( "\n" ); +printf( " gl jc kc ic jr ir\n" ); +printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu\n", +global_num_threads, jc_nt, kc_nt, ic_nt, jr_nt, ir_nt ); +printf( "=======================================\n" ); +#endif thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); @@ -235,86 +247,7 @@ thrinfo_t** bli_l3_thrinfo_create_paths for( int e = 0; e < ir_way; e++ ) { thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); -#if 0 - // Macrokernel loops - thrinfo_t* ir_info - = - bli_l3_thrinfo_create( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL ); - thrinfo_t* jr_info - = - bli_l3_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info ); - //blk_var_1 - thrinfo_t* pack_ic_in - = - bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - thrinfo_t* pack_ic_out - = - bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info - = - bli_l3_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info ); - //blk_var_3 - thrinfo_t* pack_kc_in - = - 
bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* pack_kc_out - = - bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info - = - bli_l3_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info ); - //blk_var_2 - thrinfo_t* pack_jc_in - = - bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* pack_jc_out - = - bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info - = - bli_l3_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info ); -// assume ic = 2; jr = 4 - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; = 1*2*4*1 - dim_t kc_nt = ic_way * jr_way * ir_way; = 2*4*1 - dim_t ic_nt = jr_way * ir_way; = 4*1 - dim_t jr_nt = ir_way; = 1 - dim_t ir_nt = 1; -#endif dim_t ir_comm_id = 0; dim_t jr_comm_id = e*ir_nt + ir_comm_id; dim_t ic_comm_id = d*jr_nt + jr_comm_id; @@ -372,11 +305,25 @@ thrinfo_t** bli_l3_thrinfo_create_paths kc_info ); paths[global_comm_id] = jc_info; + +#ifdef PRINT_THRINFO +printf( " gl jc kc ic jr ir\n" ); +printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu\n", +global_comm_id, jc_comm_id, kc_comm_id, ic_comm_id, jr_comm_id, ir_comm_id ); +//printf( " a b c d e\n" ); +printf( "work ids: %4ld %4ld %4ld %4ld %4ld\n", (long int)a, (long int)b, (long int)c, (long int)d, (long int)e ); +printf( "---------------------------------------\n" ); +#endif + } } } } } +#ifdef PRINT_THRINFO +exit(1); +#endif + return paths; } diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 5ebb6609f..9c0b28575 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -68,28 +68,33 @@ struct thrinfo_s }; typedef struct thrinfo_s thrinfo_t; - -#define 
bli_thread_num_threads( t ) ( t->ocomm->n_threads ) - -#define bli_thread_n_way( t ) ( t->n_way ) -#define bli_thread_work_id( t ) ( t->work_id ) -#define bli_thread_am_ochief( t ) ( t->ocomm_id == 0 ) -#define bli_thread_am_ichief( t ) ( t->icomm_id == 0 ) - -#define bli_thread_obroadcast( t, ptr ) bli_thrcomm_bcast( t->ocomm, t->ocomm_id, ptr ) -#define bli_thread_ibroadcast( t, ptr ) bli_thrcomm_bcast( t->icomm, t->icomm_id, ptr ) -#define bli_thread_obarrier( t ) bli_thrcomm_barrier( t->ocomm, t->ocomm_id ) -#define bli_thread_ibarrier( t ) bli_thrcomm_barrier( t->icomm, t->icomm_id ) - // -// Generic accessor macros for all thrinfo_t objects. +// thrinfo_t macros +// NOTE: The naming of these should be made consistent at some point. // -#define bli_thrinfo_ocomm( t ) ( t->ocomm ) -#define bli_thrinfo_icomm( t ) ( t->icomm ) -#define bli_thrinfo_needs_free_comms( t ) ( t->free_comms ) +#define bli_thread_num_threads( t ) ( (t)->ocomm->n_threads ) -#define bli_thrinfo_sub_node( t ) ( t->sub_node ) +#define bli_thread_n_way( t ) ( (t)->n_way ) +#define bli_thread_work_id( t ) ( (t)->work_id ) + +#define bli_thread_am_ochief( t ) ( (t)->ocomm_id == 0 ) +#define bli_thread_am_ichief( t ) ( (t)->icomm_id == 0 ) + +#define bli_thread_obroadcast( t, p ) bli_thrcomm_bcast( (t)->ocomm, \ + (t)->ocomm_id, p ) +#define bli_thread_ibroadcast( t, p ) bli_thrcomm_bcast( (t)->icomm, \ + (t)->icomm_id, p ) +#define bli_thread_obarrier( t ) bli_thrcomm_barrier( (t)->ocomm, \ + (t)->ocomm_id ) +#define bli_thread_ibarrier( t ) bli_thrcomm_barrier( (t)->icomm, \ + (t)->icomm_id ) + +#define bli_thrinfo_ocomm( t ) ( (t)->ocomm ) +#define bli_thrinfo_icomm( t ) ( (t)->icomm ) +#define bli_thrinfo_needs_free_comms( t ) ( (t)->free_comms ) + +#define bli_thrinfo_sub_node( t ) ( (t)->sub_node ) // // Prototypes for level-3 thrinfo functions not specific to any operation. 
From e1453f68f6afd90ae9a29b7a5faa46aa79bbf741 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Fri, 16 Sep 2016 09:29:28 -0500 Subject: [PATCH 11/27] Fixes broken URL in README.md --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 9fd5c6146..9bfa84285 100644 --- a/README.md +++ b/README.md @@ -6,16 +6,17 @@ Introduction ------------ BLIS is a portable software framework for instantiating high-performance -BLAS-like dense linear algebra libraries. The framework was designed to -isolate essential kernels of computation that, when optimized, immediately -enable optimized implementations of most of its commonly used and -computationally intensive operations. BLIS is written in [ISO +BLAS-like dense linear algebra libraries. The framework was designed to isolate +essential kernels of computation that, when optimized, immediately enable +optimized implementations of most of its commonly used and computationally +intensive operations. BLIS is written in [ISO C99](http://en.wikipedia.org/wiki/C99) and available under a [new/modified/3-clause BSD license](http://opensource.org/licenses/BSD-3-Clause). While BLIS exports a -[new BLAS-like API](), it also includes a BLAS compatibility layer which gives -application developers access to BLIS implementations via traditional [BLAS -routine calls](http://www.netlib.org/lapack/lug/node145.html). +[new BLAS-like API](https://github.com/flame/blis/wiki/BLISAPIQuickReference), +it also includes a BLAS compatibility layer which gives application developers +access to BLIS implementations via traditional [BLAS routine +calls](http://www.netlib.org/lapack/lug/node145.html). 
For a thorough presentation of our framework, please read our recently accepted journal article, ["BLIS: A Framework for Rapidly Instantiating BLAS From 7f32dd57c6bd41c0704341752842277dd6a4c8eb Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Sat, 17 Sep 2016 11:33:57 -0500 Subject: [PATCH 12/27] Adds sanity check to configuration choice. --- configure | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/configure b/configure index e0dc82c89..79d99f7ac 100755 --- a/configure +++ b/configure @@ -424,6 +424,14 @@ main() echo "${script_name}: manual configuration requested." config_name=$1 + + # Ensure configuration is valid. + if [ ! -d "${config_dirpath}/${config_name}" ]; then + echo "${script_name}: " + echo "${script_name}: *** configuration '${config_name}' does not exist. ***" + echo "${script_name}: " + exit 1; + fi fi echo "${script_name}: configuring with '${config_name}' configuration sub-directory." From fd04869ae4d4a3b0ebb9052557c296456bce7c0d Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 27 Sep 2016 14:14:11 -0500 Subject: [PATCH 13/27] Changed configure's 'omp' threading to 'openmp'. Details: - Changed the configure script so that the expected string argument to the -t (or --enable-threading=) option that enables OpenMP multithreading is 'openmp'. The previous expected string, 'omp', is still supported but should be considered deprecated. --- configure | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/configure b/configure index e0dc82c89..320f16e35 100755 --- a/configure +++ b/configure @@ -91,7 +91,7 @@ print_usage() echo " -t MODEL, --enable-threading[=MODEL], --disable-threading" echo " " echo " Enable threading in the library, using threading model" - echo " MODEL={omp,pthreads,no}. If MODEL=no or " + echo " MODEL={openmp,pthreads,no}. If MODEL=no or " echo " --disable-threading is specified, threading will be" echo " disabled. The default is 'no'." 
echo " " @@ -486,13 +486,15 @@ main() # Check the threading model flag. + # NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred. enable_openmp='no' enable_openmp_01=0 enable_pthreads='no' enable_pthreads_01=0 if [ "x${threading_model}" = "xauto" ]; then echo "${script_name}: determining the threading model automatically." - elif [ "x${threading_model}" = "xomp" ]; then + elif [ "x${threading_model}" = "xopenmp" ] || + [ "x${threading_model}" = "xomp" ]; then echo "${script_name}: using OpenMP for threading." enable_openmp='yes' enable_openmp_01=1 From 8d55033c966feed99fcca2a58017c3ab5b1646dc Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 27 Sep 2016 15:20:58 -0500 Subject: [PATCH 14/27] Implemented distributed thrinfo_t management. Details: - Implemented Ricardo Magana's distributed thread info/communicator management. Rather than fully construct the thrinfo_t structures, from root to leaf, prior to spawning threads, the threads individually construct their thrinfo_t trees (or, chains), and do so incrementally, as needed, reusing the same structure nodes during subsequent blocked variant iterations. This required moving the initial creation of the thrinfo_t structure (now, the root nodes) from the _front() functions to the bli_l3_thread_decorator(). The incremental "growing" of the tree is performed in the internal back-end (ie: _int()) function, and so mostly invisible. Also, the incremental growth of the thrinfo_t tree is done as a function of the current and parent control tree nodes (as well as the parent thrinfo_t node), further reinforcing the parallel relationship between the two data structures. - Removed the "inner" communicator from thrinfo_t structure definition, as well as its id. Changed all APIs accordingly. Renamed bli_thrinfo_needs_free_comms() to bli_thrinfo_needs_free_comm(). - Defined bli_l3_thrinfo_print_paths(), which prints the information in an array of thrinfo_t* structure pointers. 
(Used only as a debugging/verification tool.) - Deprecated the following thrinfo_t creation functions: bli_packm_thrinfo_create() bli_l3_thrinfo_create() because they are no longer used. bli_thrinfo_create() is now called directly when creating thrinfo_t nodes. --- frame/1m/packm/bli_packm_thrinfo.c | 12 +- frame/1m/packm/bli_packm_thrinfo.h | 8 +- frame/3/bli_l3_thrinfo.c | 356 ++++++++++++++++++---------- frame/3/bli_l3_thrinfo.h | 38 ++- frame/3/gemm/bli_gemm_blk_var3.c | 4 +- frame/3/gemm/bli_gemm_cntl.c | 17 +- frame/3/gemm/bli_gemm_front.c | 22 +- frame/3/gemm/bli_gemm_int.c | 18 +- frame/3/hemm/bli_hemm_front.c | 10 +- frame/3/her2k/bli_her2k_front.c | 19 +- frame/3/herk/bli_herk_front.c | 10 +- frame/3/symm/bli_symm_front.c | 10 +- frame/3/syr2k/bli_syr2k_front.c | 15 +- frame/3/syrk/bli_syrk_front.c | 10 +- frame/3/trmm/bli_trmm_front.c | 10 +- frame/3/trmm3/bli_trmm3_front.c | 10 +- frame/3/trsm/bli_trsm_blk_var3.c | 3 +- frame/3/trsm/bli_trsm_cntl.c | 34 ++- frame/3/trsm/bli_trsm_front.c | 10 +- frame/3/trsm/bli_trsm_int.c | 3 + frame/base/bli_cntx.c | 121 ++++++++++ frame/base/bli_cntx.h | 47 ++++ frame/include/bli_type_defs.h | 17 ++ frame/thread/bli_thrcomm.h | 6 + frame/thread/bli_thrcomm_openmp.c | 28 ++- frame/thread/bli_thrcomm_pthreads.c | 83 ++++--- frame/thread/bli_thrcomm_single.c | 26 +- frame/thread/bli_thread.c | 8 +- frame/thread/bli_thread.h | 18 +- frame/thread/bli_thrinfo.c | 201 ++++++++++++++-- frame/thread/bli_thrinfo.h | 77 +++--- 31 files changed, 887 insertions(+), 364 deletions(-) diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c index 1c1265661..2287a7222 100644 --- a/frame/1m/packm/bli_packm_thrinfo.c +++ b/frame/1m/packm/bli_packm_thrinfo.c @@ -34,12 +34,11 @@ #include "blis.h" +#if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -51,7 +50,6 @@ thrinfo_t* bli_packm_thrinfo_create 
( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, FALSE, @@ -60,14 +58,13 @@ thrinfo_t* bli_packm_thrinfo_create return thread; } +#endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -77,7 +74,6 @@ void bli_packm_thrinfo_init ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, FALSE, sub_node @@ -93,13 +89,13 @@ void bli_packm_thrinfo_init_single ( thread, &BLIS_SINGLE_COMM, 0, - &BLIS_SINGLE_COMM, 0, 1, 0, NULL ); } +#if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread @@ -109,4 +105,4 @@ void bli_packm_thrinfo_free thread != &BLIS_PACKM_SINGLE_THREADED ) bli_free_intl( thread ); } - +#endif diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h index 7b6d7ae4d..5da496f96 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/1m/packm/bli_packm_thrinfo.h @@ -42,24 +42,22 @@ // thrinfo_t APIs specific to packm. 
// +#if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); +#endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -70,8 +68,10 @@ void bli_packm_thrinfo_init_single thrinfo_t* thread ); +#if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); +#endif diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 36b65b52b..78b2b775c 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -35,12 +35,11 @@ #include "blis.h" #include "assert.h" +#if 0 thrinfo_t* bli_l3_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -49,21 +48,19 @@ thrinfo_t* bli_l3_thrinfo_create return bli_thrinfo_create ( ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, TRUE, sub_node ); } +#endif void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -73,7 +70,6 @@ void bli_l3_thrinfo_init ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, TRUE, @@ -105,14 +101,12 @@ void bli_l3_thrinfo_free // is marked as needing them to be freed. The most common example of // thrinfo_t nodes NOT marked as needing their comms freed are those // associated with packm thrinfo_t nodes. - if ( bli_thrinfo_needs_free_comms( thread ) ) + if ( bli_thrinfo_needs_free_comm( thread ) ) { // The ochief always frees his communicator, and the ichief free its // communicator if we are at the leaf node. if ( bli_thread_am_ochief( thread ) ) bli_thrcomm_free( bli_thrinfo_ocomm( thread ) ); - if ( thrinfo_sub_node == NULL && bli_thread_am_ichief( thread ) ) - bli_thrcomm_free( bli_thrinfo_icomm( thread ) ); } // Free all children of the current thrinfo_t. 
@@ -124,117 +118,208 @@ void bli_l3_thrinfo_free // ----------------------------------------------------------------------------- -//#define PRINT_THRINFO - -thrinfo_t** bli_l3_thrinfo_create_paths +void bli_l3_thrinfo_create_root ( - opid_t l3_op, - side_t side + dim_t id, + thrcomm_t* gl_comm, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ) { - dim_t jc_in, jc_way; - dim_t kc_in, kc_way; - dim_t ic_in, ic_way; - dim_t jr_in, jr_way; - dim_t ir_in, ir_way; + // Query the global communicator for the total number of threads to use. + dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); -#ifdef BLIS_ENABLE_MULTITHREADING - jc_in = bli_env_read_nway( "BLIS_JC_NT" ); - //kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - kc_in = 1; - ic_in = bli_env_read_nway( "BLIS_IC_NT" ); - jr_in = bli_env_read_nway( "BLIS_JR_NT" ); - ir_in = bli_env_read_nway( "BLIS_IR_NT" ); -#else - jc_in = 1; - kc_in = 1; - ic_in = 1; - jr_in = 1; - ir_in = 1; -#endif + // Use the thread id passed in as the global communicator id. + dim_t gl_comm_id = id; - if ( l3_op == BLIS_TRMM ) - { - // We reconfigure the parallelism for trmm_r due to a dependency in - // the jc loop. (NOTE: This dependency does not exist for trmm3.) - if ( bli_is_right( side ) ) - { - jc_way = 1; - kc_way = kc_in; - ic_way = ic_in; - jr_way = jr_in * jc_in; - ir_way = ir_in; - } - else // if ( bli_is_left( side ) ) - { - jc_way = jc_in; - kc_way = kc_in; - ic_way = ic_in; - jr_way = jr_in; - ir_way = ir_in; - } - } - else if ( l3_op == BLIS_TRSM ) - { - if ( bli_is_right( side ) ) - { + // Use the blocksize id of the current (root) control tree node to + // query the top-most ways of parallelism to obtain. 
+ bszid_t bszid = bli_cntl_bszid( cntl ); + dim_t xx_way = bli_cntx_way_for_bszid( bszid, cntx ); - jc_way = 1; - kc_way = 1; - ic_way = jc_in * ic_in * jr_in; - jr_way = 1; - ir_way = 1; - } - else // if ( bli_is_left( side ) ) - { - jc_way = 1; - kc_way = 1; - ic_way = 1; - jr_way = ic_in * jr_in * ir_in; - ir_way = 1; - } - } - else // all other level-3 operations + // Determine the work id for this thrinfo_t node. + dim_t work_id = gl_comm_id / ( n_threads / xx_way ); + + // Create the root thrinfo_t node. + *thread = bli_thrinfo_create + ( + gl_comm, + gl_comm_id, + xx_way, + work_id, + TRUE, + NULL + ); +} + +// ----------------------------------------------------------------------------- + +void bli_l3_thrinfo_print_paths + ( + thrinfo_t** threads + ) +{ + dim_t n_threads = bli_thread_num_threads( threads[0] ); + dim_t gl_comm_id; + + thrinfo_t* jc_info = threads[0]; + thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info ); + thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info ); + thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info ); + thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info ); + thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info ); + thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info ); + + dim_t jc_way = bli_thread_n_way( jc_info ); + dim_t pc_way = bli_thread_n_way( pc_info ); + dim_t pb_way = bli_thread_n_way( pb_info ); + dim_t ic_way = bli_thread_n_way( ic_info ); + dim_t pa_way = bli_thread_n_way( pa_info ); + dim_t jr_way = bli_thread_n_way( jr_info ); + dim_t ir_way = bli_thread_n_way( ir_info ); + + dim_t gl_nt = bli_thread_num_threads( jc_info ); + dim_t jc_nt = bli_thread_num_threads( pc_info ); + dim_t pc_nt = bli_thread_num_threads( pb_info ); + dim_t pb_nt = bli_thread_num_threads( ic_info ); + dim_t ic_nt = bli_thread_num_threads( pa_info ); + dim_t pa_nt = bli_thread_num_threads( jr_info ); + dim_t jr_nt = bli_thread_num_threads( ir_info ); + + printf( " gl jc kc pb ic pa jr ir\n" ); + printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu 
%4lu %4lu\n", + gl_nt, jc_nt, pc_nt, pb_nt, ic_nt, pa_nt, jr_nt, (dim_t)1 ); + printf( "\n" ); + printf( " jc kc pb ic pa jr ir\n" ); + printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", + jc_way, pc_way, pb_way, ic_way, pa_way, jr_way, ir_way ); + printf( "=================================================\n" ); + + for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id ) { - jc_way = jc_in; - kc_way = kc_in; - ic_way = ic_in; - jr_way = jr_in; - ir_way = ir_in; + jc_info = threads[gl_comm_id]; + pc_info = bli_thrinfo_sub_node( jc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + dim_t gl_comm_id = bli_thread_ocomm_id( jc_info ); + dim_t jc_comm_id = bli_thread_ocomm_id( pc_info ); + dim_t pc_comm_id = bli_thread_ocomm_id( pb_info ); + dim_t pb_comm_id = bli_thread_ocomm_id( ic_info ); + dim_t ic_comm_id = bli_thread_ocomm_id( pa_info ); + dim_t pa_comm_id = bli_thread_ocomm_id( jr_info ); + dim_t jr_comm_id = bli_thread_ocomm_id( ir_info ); + + dim_t jc_work_id = bli_thread_work_id( jc_info ); + dim_t pc_work_id = bli_thread_work_id( pc_info ); + dim_t pb_work_id = bli_thread_work_id( pb_info ); + dim_t ic_work_id = bli_thread_work_id( ic_info ); + dim_t pa_work_id = bli_thread_work_id( pa_info ); + dim_t jr_work_id = bli_thread_work_id( jr_info ); + dim_t ir_work_id = bli_thread_work_id( ir_info ); + +printf( " gl jc pb kc pa ic jr \n" ); +printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +gl_comm_id, jc_comm_id, pc_comm_id, pb_comm_id, ic_comm_id, pa_comm_id, jr_comm_id ); +printf( "work ids: %4ld %4ld %4lu %4lu %4ld %4ld %4ld\n", +jc_work_id, pc_work_id, pb_work_id, ic_work_id, pa_work_id, jr_work_id, ir_work_id ); +printf( "---------------------------------------\n" ); } +} - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( 
global_num_threads != 0 ); +// ----------------------------------------------------------------------------- - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; +#if 0 +thrinfo_t** bli_l3_thrinfo_create_roots + ( + cntx_t* cntx, + cntl_t* cntl + ) +{ + // Query the context for the total number of threads to use. + dim_t n_threads = bli_cntx_get_num_threads( cntx ); + + // Create a global thread communicator for all the threads. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + + // Allocate an array of thrinfo_t pointers, one for each thread. + thrinfo_t** paths = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) ); + + // Use the blocksize id of the current (root) control tree node to + // query the top-most ways of parallelism to obtain. + bszid_t bszid = bli_cntl_bszid( cntl ); + dim_t xx_way = bli_cntx_way_for_bszid( bszid, cntx ); + + dim_t gl_comm_id; + + // Create one thrinfo_t node for each thread in the (global) communicator. + for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id ) + { + dim_t work_id = gl_comm_id / ( n_threads / xx_way ); + + paths[ gl_comm_id ] = bli_thrinfo_create + ( + gl_comm, + gl_comm_id, + xx_way, + work_id, + TRUE, + NULL + ); + } + + return paths; +} + +//#define PRINT_THRINFO + +thrinfo_t** bli_l3_thrinfo_create_full_paths + ( + cntx_t* cntx + ) +{ + dim_t jc_way = bli_cntx_jc_way( cntx ); + dim_t pc_way = bli_cntx_pc_way( cntx ); + dim_t ic_way = bli_cntx_ic_way( cntx ); + dim_t jr_way = bli_cntx_jr_way( cntx ); + dim_t ir_way = bli_cntx_ir_way( cntx ); + + dim_t gl_nt = jc_way * pc_way * ic_way * jr_way * ir_way; + dim_t jc_nt = pc_way * ic_way * jr_way * ir_way; + dim_t pc_nt = ic_way * jr_way * ir_way; dim_t ic_nt = jr_way * ir_way; dim_t jr_nt = ir_way; dim_t ir_nt = 1; + assert( gl_nt != 0 ); + #ifdef PRINT_THRINFO -printf( " jc kc ic jr ir\n" ); -printf( "xx_way: %4lu %4lu %4lu %4lu %4lu\n", - jc_way, kc_way, ic_way, jr_way, ir_way ); +printf( " gl jc kc pb ic pa jr 
ir\n" ); +printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +gl_nt, jc_nt, pc_nt, pc_nt, ic_nt, ic_nt, jr_nt, ir_nt ); printf( "\n" ); -printf( " gl jc kc ic jr ir\n" ); -printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu\n", -global_num_threads, jc_nt, kc_nt, ic_nt, jr_nt, ir_nt ); -printf( "=======================================\n" ); +printf( " jc kc pb ic pa jr ir\n" ); +printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +jc_way, pc_way, (dim_t)0, ic_way, (dim_t)0, jr_way, ir_way ); +printf( "=================================================\n" ); #endif - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); + thrinfo_t** paths = bli_malloc_intl( gl_nt * sizeof( thrinfo_t* ) ); - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( gl_nt ); for( int a = 0; a < jc_way; a++ ) { thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) + for( int b = 0; b < pc_way; b++ ) { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); + thrcomm_t* pc_comm = bli_thrcomm_create( pc_nt ); for( int c = 0; c < ic_way; c++ ) { @@ -246,73 +331,83 @@ printf( "=======================================\n" ); for( int e = 0; e < ir_way; e++ ) { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; + //thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t pc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*pc_nt + pc_comm_id; + dim_t gl_comm_id = a*jc_nt + jc_comm_id; // macro-kernel loops thrinfo_t* ir_info = bli_l3_thrinfo_create( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, ir_way, e, NULL ); thrinfo_t* 
jr_info = bli_l3_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, jr_way, d, ir_info ); // packa - thrinfo_t* pack_ic_in + thrinfo_t* pa_info = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, ic_nt, ic_comm_id, jr_info ); // blk_var1 thrinfo_t* ic_info = - bli_l3_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, + bli_l3_thrinfo_create( pc_comm, pc_comm_id, ic_way, c, - pack_ic_in ); + pa_info ); // packb - thrinfo_t* pack_kc_in + thrinfo_t* pb_info = - bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id, + bli_packm_thrinfo_create( pc_comm, pc_comm_id, + pc_nt, pc_comm_id, ic_info ); // blk_var3 - thrinfo_t* kc_info + thrinfo_t* pc_info = bli_l3_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_in ); + pc_way, b, + pb_info ); // blk_var2 thrinfo_t* jc_info = - bli_l3_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, + bli_l3_thrinfo_create( gl_comm, gl_comm_id, jc_way, a, - kc_info ); + pc_info ); - paths[global_comm_id] = jc_info; + paths[gl_comm_id] = jc_info; #ifdef PRINT_THRINFO -printf( " gl jc kc ic jr ir\n" ); -printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu\n", -global_comm_id, jc_comm_id, kc_comm_id, ic_comm_id, jr_comm_id, ir_comm_id ); -//printf( " a b c d e\n" ); -printf( "work ids: %4ld %4ld %4ld %4ld %4ld\n", (long int)a, (long int)b, (long int)c, (long int)d, (long int)e ); -printf( "---------------------------------------\n" ); +{ +dim_t gl_comm_id = bli_thread_ocomm_id( jc_info ); +dim_t jc_comm_id = bli_thread_ocomm_id( pc_info ); +dim_t pc_comm_id = bli_thread_ocomm_id( pb_info ); +dim_t pb_comm_id = bli_thread_ocomm_id( ic_info ); +dim_t ic_comm_id = bli_thread_ocomm_id( pa_info ); +dim_t pa_comm_id = bli_thread_ocomm_id( jr_info ); +dim_t jr_comm_id = bli_thread_ocomm_id( ir_info ); + +dim_t jc_work_id = bli_thread_work_id( jc_info ); +dim_t pc_work_id = bli_thread_work_id( pc_info ); +dim_t pb_work_id = 
bli_thread_work_id( pb_info ); +dim_t ic_work_id = bli_thread_work_id( ic_info ); +dim_t pa_work_id = bli_thread_work_id( pa_info ); +dim_t jr_work_id = bli_thread_work_id( jr_info ); +dim_t ir_work_id = bli_thread_work_id( ir_info ); + +printf( " gl jc pb kc pa ic jr \n" ); +printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +gl_comm_id, jc_comm_id, pc_comm_id, pb_comm_id, ic_comm_id, pa_comm_id, jr_comm_id ); +printf( "work ids: %4ld %4ld %4lu %4lu %4ld %4ld %4ld\n", +jc_work_id, pc_work_id, pb_work_id, ic_work_id, pa_work_id, jr_work_id, ir_work_id ); +printf( "-------------------------------------------------\n" ); +} #endif } @@ -330,15 +425,16 @@ exit(1); void bli_l3_thrinfo_free_paths ( - thrinfo_t** threads, - dim_t num + thrinfo_t** threads ) { + dim_t n_threads = bli_thread_num_threads( threads[0] ); dim_t i; - for ( i = 0; i < num; ++i ) + for ( i = 0; i < n_threads; ++i ) bli_l3_thrinfo_free( threads[i] ); bli_free_intl( threads ); } +#endif diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 7eac72298..71dea7645 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -61,24 +61,22 @@ // thrinfo_t APIs specific to level-3 operations. 
// +#if 0 thrinfo_t* bli_l3_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); +#endif void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -96,15 +94,37 @@ void bli_l3_thrinfo_free // ----------------------------------------------------------------------------- -thrinfo_t** bli_l3_thrinfo_create_paths +void bli_l3_thrinfo_create_root ( - opid_t l3_op, - side_t side + dim_t id, + thrcomm_t* gl_comm, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread + ); + +void bli_l3_thrinfo_print_paths + ( + thrinfo_t** threads + ); + +// ----------------------------------------------------------------------------- + +#if 0 +thrinfo_t** bli_l3_thrinfo_create_roots + ( + cntx_t* cntx, + cntl_t* cntl + ); + +thrinfo_t** bli_l3_thrinfo_create_full_paths + ( + cntx_t* cntx ); void bli_l3_thrinfo_free_paths ( - thrinfo_t** threads, - dim_t num + thrinfo_t** threads ); +#endif diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 7be9c6a58..0148428df 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -84,10 +84,10 @@ void bli_gemm_blk_var3 c, cntx, bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread) + bli_thrinfo_sub_node( thread ) ); - bli_thread_ibarrier( thread ); + bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. 
Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 3f3773418..b3494b174 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -46,14 +46,21 @@ cntl_t* bli_gemm_cntl_create if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; - // Create a node for the macro-kernel. - cntl_t* gemm_cntl_bp_ke = bli_gemm_cntl_obj_create + // Create two nodes for the macro-kernel. + cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_obj_create ( - BLIS_NR, // bszid not used by macro-kernel. - macro_kernel_p, + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); + cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + gemm_cntl_bu_ke + ); + // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create ( @@ -66,7 +73,7 @@ cntl_t* bli_gemm_cntl_create FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, - gemm_cntl_bp_ke + gemm_cntl_bp_bu ); // Create a node for partitioning the m dimension by MC. diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 0782d7272..324655655 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -85,13 +85,19 @@ void bli_gemm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_GEMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx ); - // Invoke the internal back-end. 
+ // Create the first node in the thrinfo_t tree for each thread. +//thrinfo_t** infos = bli_l3_thrinfo_create_full_paths( cntx ); +//bli_l3_thrinfo_print_paths( infos ); +//exit(1); +//cntl = bli_gemm_cntl_create( BLIS_GEMM ); + //thrinfo_t** infos = bli_l3_thrinfo_create_roots( cntx, cntl ); + + // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -99,10 +105,12 @@ void bli_gemm_front beta, &c_local, cntx, - cntl, - infos + cntl ); +//bli_l3_thrinfo_print_paths( infos ); +//exit(1); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Free the thrinfo_t structures. + //bli_l3_thrinfo_free_paths( infos ); } diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 18e531879..b24f2a25d 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -50,7 +50,6 @@ void bli_gemm_int obj_t b_local; obj_t c_local; gemm_voft f; - ind_t im; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -102,17 +101,22 @@ void bli_gemm_int bli_obj_scalar_apply_scalar( beta, &c_local ); } + // Create the next node in the thrinfo_t structure. + bli_thrinfo_grow( cntx, cntl, thread ); + // Extract the function pointer from the current control tree node. f = bli_cntl_var_func( cntl ); // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations. 
- im = bli_cntx_get_ind_method( cntx ); - - if ( im != BLIS_NAT ) { - if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa; - else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2; - else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; + ind_t im = bli_cntx_get_ind_method( cntx ); + + if ( im != BLIS_NAT ) + { + if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa; + else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2; + else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; + } } // Invoke the variant. diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index ed7e03b9c..8bede097b 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -92,13 +92,12 @@ void bli_hemm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_GEMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -106,10 +105,7 @@ void bli_hemm_front beta, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index f72dedf87..7350b5785 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -110,14 +110,14 @@ void bli_her2k_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_HERK, cntx ); - // Invoke herk twice, using beta only the first time. 
- thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx ); - // Invoke the internal back-end. + // Invoke herk twice, using beta only the first time. + + // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -125,13 +125,11 @@ void bli_her2k_front beta, &c_local, cntx, - cntl, - infos + cntl ); bli_l3_thread_decorator ( - n_threads, bli_gemm_int, &alpha_conj, &b_local, @@ -139,12 +137,9 @@ void bli_her2k_front &BLIS_ONE, &c_local, cntx, - cntl, - infos + cntl ); - bli_l3_thrinfo_free_paths( infos, n_threads ); - // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 3abfa9baf..7fcd2d356 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -90,13 +90,12 @@ void bli_herk_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_HERK, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -104,12 +103,9 @@ void bli_herk_front beta, &c_local, cntx, - cntl, - infos + cntl ); - bli_l3_thrinfo_free_paths( infos, n_threads ); - // The Hermitian rank-k product was computed as A*A', even for the // diagonal elements. 
Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-k product should always be diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index b864ce06a..cd2f3a20e 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -91,13 +91,12 @@ void bli_symm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_GEMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -105,10 +104,7 @@ void bli_symm_front beta, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 936c43635..47ce91795 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -91,14 +91,14 @@ void bli_syr2k_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_HERK, cntx ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx ); + // Invoke herk twice, using beta only the first time. - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); // Invoke the internal back-end. 
bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -106,13 +106,11 @@ void bli_syr2k_front beta, &c_local, cntx, - cntl, - infos + cntl ); bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &b_local, @@ -120,10 +118,7 @@ void bli_syr2k_front &BLIS_ONE, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 8b379ab0e..f037eb1c1 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -84,13 +84,12 @@ void bli_syrk_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_HERK, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -98,10 +97,7 @@ void bli_syrk_front beta, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 689acbb72..c7231c839 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -134,13 +134,12 @@ void bli_trmm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_TRMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx ); // Invoke the internal back-end. 
bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -148,10 +147,7 @@ void bli_trmm_front &BLIS_ZERO, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index e9e9261f0..cf97bbcf2 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -133,13 +133,12 @@ void bli_trmm3_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_TRMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM3, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -147,10 +146,7 @@ void bli_trmm3_front beta, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 9d726389f..7b428c8ef 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -87,7 +87,8 @@ void bli_trsm_blk_var3 bli_thrinfo_sub_node( thread ) ); - bli_thread_ibarrier( thread ); + //bli_thread_ibarrier( thread ); + bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index b4f7422ba..78bd5eeb9 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -50,14 +50,21 @@ cntl_t* bli_trsm_l_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; - // Create a node for the macro-kernel. - cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create + // Create two nodes for the macro-kernel. 
+ cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create ( - BLIS_NR, // bszid not used by macro-kernel. - macro_kernel_p, + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + trsm_cntl_bu_ke + ); + // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create ( @@ -70,7 +77,7 @@ cntl_t* bli_trsm_l_cntl_create FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, - trsm_cntl_bp_ke + trsm_cntl_bp_bu ); // Create a node for partitioning the m dimension by MC. @@ -122,14 +129,21 @@ cntl_t* bli_trsm_r_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; - // Create a node for the macro-kernel. - cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create + // Create two nodes for the macro-kernel. + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create ( - BLIS_NR, // bszid not used by macro-kernel. - macro_kernel_p, + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + trsm_cntl_bu_ke + ); + // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create ( @@ -142,7 +156,7 @@ cntl_t* bli_trsm_r_cntl_create FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, - trsm_cntl_bp_ke + trsm_cntl_bp_bu ); // Create a node for partitioning the m dimension by MC. 
diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 3466d2d18..95c2d6aab 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -119,13 +119,12 @@ void bli_trsm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_TRSM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRSM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_trsm_int, alpha, &a_local, @@ -133,10 +132,7 @@ void bli_trsm_front alpha, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index e6614cb3f..796af7866 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -117,6 +117,9 @@ void bli_trsm_int // FGVZ->TMS: Is this barrier still needed? bli_thread_obarrier( thread ); + // Create the next node in the thrinfo_t structure. + bli_thrinfo_grow( cntx, cntl, thread ); + // Extract the function pointer from the current control tree node. 
f = bli_cntl_var_func( cntl ); diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index f2885cca3..31e995e1b 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -341,6 +341,37 @@ pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ) } #endif +dim_t bli_cntx_get_num_threads( cntx_t* cntx ) +{ + return bli_cntx_jc_way( cntx ) * + bli_cntx_pc_way( cntx ) * + bli_cntx_ic_way( cntx ) * + bli_cntx_jr_way( cntx ) * + bli_cntx_ir_way( cntx ); +} + +dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ) +{ + dim_t n_threads_in = 1; + + for ( ; cntl != NULL; cntl = bli_cntl_sub_node( cntl ) ) + { + bszid_t bszid = bli_cntl_bszid( cntl ); + dim_t cur_way; + + // We assume bszid is in {KR,MR,NR,MC,KC,NC} if it is not + // BLIS_NO_PART. + if ( bszid != BLIS_NO_PART ) + cur_way = bli_cntx_way_for_bszid( bszid, cntx ); + else + cur_way = 1; + + n_threads_in *= cur_way; + } + + return n_threads_in; +} + // ----------------------------------------------------------------------------- #if 1 @@ -663,6 +694,96 @@ void bli_cntx_set_pack_schema_c( pack_t schema_c, bli_cntx_set_schema_c( schema_c, cntx ); } +void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx ) +{ + dim_t jc, pc, ic, jr, ir; + +#ifdef BLIS_ENABLE_MULTITHREADING + jc = bli_env_read_nway( "BLIS_JC_NT" ); + //pc = bli_env_read_nway( "BLIS_KC_NT" ); + pc = 1; + ic = bli_env_read_nway( "BLIS_IC_NT" ); + jr = bli_env_read_nway( "BLIS_JR_NT" ); + ir = bli_env_read_nway( "BLIS_IR_NT" ); +#else + jc = 1; + pc = 1; + ic = 1; + jr = 1; + ir = 1; +#endif + + if ( l3_op == BLIS_TRMM ) + { + // We reconfigure the parallelism from trmm_r due to a dependency in + // the jc loop. 
(NOTE: This dependency does not exist for trmm3 ) + if ( bli_is_right( side ) ) + { + bli_cntx_set_thrloop + ( + 1, + pc, + ic, + jr * jc, + ir, + cntx + ); + } + else // if ( bli_is_left( side ) ) + { + bli_cntx_set_thrloop + ( + jc, + pc, + ic, + jr, + ir, + cntx + ); + } + } + else if ( l3_op == BLIS_TRSM ) + { + if ( bli_is_right( side ) ) + { + bli_cntx_set_thrloop + ( + 1, + 1, + jc * ic * jr, + 1, + 1, + cntx + ); + } + else // if ( bli_is_left( side ) ) + { + bli_cntx_set_thrloop + ( + 1, + 1, + 1, + ic * jr * ir, + 1, + cntx + ); + } + } + else // all other level-3 operations (neither BLIS_TRMM nor BLIS_TRSM) + { + bli_cntx_set_thrloop + ( + jc, + pc, + ic, + jr, + ir, + cntx + ); + } +} + + // ----------------------------------------------------------------------------- bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 21f9c0fe0..6aed68111 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -59,6 +59,8 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + dim_t* thrloop; + membrk_t* membrk; } cntx_t; */ @@ -127,6 +129,36 @@ typedef struct cntx_s \ ( (cntx)->membrk ) +#define bli_cntx_thrloop( cntx ) \ +\ + ( (cntx)->thrloop ) + +#if 1 +#define bli_cntx_jc_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_NC ] ) + +#define bli_cntx_pc_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_KC ] ) + +#define bli_cntx_ic_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_MC ] ) + +#define bli_cntx_jr_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_NR ] ) + +#define bli_cntx_ir_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_MR ] ) +#endif + +#define bli_cntx_way_for_bszid( bszid, cntx ) \ +\ + ( (cntx)->thrloop[ bszid ] ) + // cntx_t modification (fields only) #define bli_cntx_set_blkszs_buf( _blkszs, cntx_p ) \ @@ -199,6 +231,16 @@ typedef struct cntx_s (cntx_p)->membrk = _membrk; \ } +#define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \ +{ \ + (cntx_p)->thrloop[ BLIS_NC ] = jc_; \ + (cntx_p)->thrloop[ BLIS_KC ] = pc_; \ + (cntx_p)->thrloop[ BLIS_MC ] = ic_; \ + 
(cntx_p)->thrloop[ BLIS_MC ] = ic_; \ + (cntx_p)->thrloop[ BLIS_NR ] = jr_; \ + (cntx_p)->thrloop[ BLIS_MR ] = ir_; \ + (cntx_p)->thrloop[ BLIS_KR ] = 1; \ +} + // cntx_t query (complex) #define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \ @@ -356,6 +398,8 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx ); +dim_t bli_cntx_get_num_threads( cntx_t* cntx ); +dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ); // set functions @@ -390,6 +434,9 @@ void bli_cntx_set_pack_schema_b( pack_t schema_b, cntx_t* cntx ); void bli_cntx_set_pack_schema_c( pack_t schema_c, cntx_t* cntx ); +void bli_cntx_set_thrloop_from_env( opid_t l3_op, + side_t side, + cntx_t* cntx ); // other query functions diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 086740cfd..726f4a700 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -638,6 +638,21 @@ typedef enum #define BLIS_NUM_UKR_IMPL_TYPES 4 +#if 0 +typedef enum +{ + BLIS_JC_IDX = 0, + BLIS_PC_IDX, + BLIS_IC_IDX, + BLIS_JR_IDX, + BLIS_IR_IDX, + BLIS_PR_IDX, +} thridx_t; +#endif + +#define BLIS_NUM_LOOPS 6 + + // -- Operation ID type -- typedef enum @@ -949,6 +964,8 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + dim_t thrloop[ BLIS_NUM_LOOPS ]; + membrk_t* membrk; } cntx_t; diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 6b4d2de1a..593f8d7fa 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -41,6 +41,12 @@ #include "bli_thrcomm_openmp.h" #include "bli_thrcomm_pthreads.h" + +// thrcomm_t query (field only) + +#define bli_thrcomm_num_threads( comm ) ( (comm)->n_threads ) + + // Thread communicator prototypes. 
thrcomm_t* bli_thrcomm_create( dim_t n_threads ); void bli_thrcomm_free( thrcomm_t* communicator ); diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 7c1fe69f9..68d9d7a29 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -201,7 +201,6 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) void bli_l3_thread_decorator ( - dim_t n_threads, l3int_t func, obj_t* alpha, obj_t* a, @@ -209,20 +208,28 @@ void bli_l3_thread_decorator obj_t* beta, obj_t* c, cntx_t* cntx, - cntl_t* cntl, - thrinfo_t** thread + cntl_t* cntl ) { + // Query the total number of threads from the context. + dim_t n_threads = bli_cntx_get_num_threads( cntx ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + _Pragma( "omp parallel num_threads(n_threads)" ) { - dim_t omp_id = omp_get_thread_num(); - thrinfo_t* thread_i = thread[omp_id]; + dim_t id = omp_get_thread_num(); cntl_t* cntl_use; + thrinfo_t* thread; // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); + func ( alpha, @@ -232,12 +239,19 @@ void bli_l3_thread_decorator c, cntx, cntl_use, - thread[omp_id] + thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + + // Free the current thread's thrinfo_t structure. + bli_l3_thrinfo_free( thread ); } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called above). 
} #endif diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 0f2707d91..230b63905 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -136,7 +136,8 @@ typedef struct thread_data obj_t* c; cntx_t* cntx; cntl_t* cntl; - thrinfo_t* thread; + dim_t id; + thrcomm_t* gl_comm; } thread_data_t; // Entry point for additional threads @@ -151,13 +152,18 @@ void* bli_l3_thread_entry( void* data_void ) obj_t* c = data->c; cntx_t* cntx = data->cntx; cntl_t* cntl = data->cntl; - thrinfo_t* thread_i = data->thread; + dim_t id = data->id; + thrcomm_t* gl_comm = data->gl_comm; cntl_t* cntl_use; + thrinfo_t* thread; // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); + data->func ( alpha, @@ -171,14 +177,16 @@ void* bli_l3_thread_entry( void* data_void ) ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + + // Free the current thread's thrinfo_t structure. + bli_l3_thrinfo_free( thread ); return NULL; } void bli_l3_thread_decorator ( - dim_t n_threads, l3int_t func, obj_t* alpha, obj_t* a, @@ -186,50 +194,51 @@ void bli_l3_thread_decorator obj_t* beta, obj_t* c, cntx_t* cntx, - cntl_t* cntl, - thrinfo_t** thread + cntl_t* cntl ) { - pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads ); - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + // Query the total number of threads from the context. + dim_t n_threads = bli_cntx_get_num_threads( cntx ); - for ( int i = 1; i < n_threads; i++ ) + // Allocate an array of pthread objects and auxiliary data structs to pass + // to the thread entry functions. 
+ pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads ); + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + + // NOTE: We must iterate backwards so that the chief thread (thread id 0) + // can spawn all other threads before proceeding with its own computation. + for ( dim_t id = n_threads - 1; 0 <= id; id-- ) { // Set up thread data for additional threads (beyond thread 0). - datas[i].func = func; - datas[i].alpha = alpha; - datas[i].a = a; - datas[i].b = b; - datas[i].beta = beta; - datas[i].c = c; - datas[i].cntx = cntx; - datas[i].cntl = cntl; - datas[i].thread = thread[i]; + datas[id].func = func; + datas[id].alpha = alpha; + datas[id].a = a; + datas[id].b = b; + datas[id].beta = beta; + datas[id].c = c; + datas[id].cntx = cntx; + datas[id].cntl = cntl; + datas[id].id = id; + datas[id].gl_comm = gl_comm; - // Spawn additional threads. - pthread_create( &pthreads[i], NULL, &bli_l3_thread_entry, &datas[i] ); - } - - - // The main thread executes this. - { - cntl_t* cntl_use; - - // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); - - // Thread 0 simply executes func. - func( alpha, a, b, beta, c, cntx, cntl, thread[0] ); - - // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread[0] ); + // Spawn additional threads for ids greater than 0. + if ( id != 0 ) + pthread_create( &pthreads[id], NULL, &bli_l3_thread_entry, &datas[id] ); + else + bli_l3_thread_entry( ( void* )(&datas[0]) ); } + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called from the thread entry function). // Thread 0 waits for additional threads to finish. 
- for ( int i = 1; i < n_threads; i++) + for ( dim_t id = 1; id < n_threads; id++ ) { - pthread_join( pthreads[i], NULL ); + pthread_join( pthreads[id], NULL ); } bli_free_intl( pthreads ); diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index 99de67220..c038f59a0 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -73,7 +73,6 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) void bli_l3_thread_decorator ( - dim_t n_threads, l3int_t func, obj_t* alpha, obj_t* a, @@ -81,17 +80,25 @@ void bli_l3_thread_decorator obj_t* beta, obj_t* c, cntx_t* cntx, - cntl_t* cntl, - thrinfo_t** thread + cntl_t* cntl ) { - thrinfo_t* thread_i = thread[0]; + // For sequential execution, we use only one thread. + dim_t n_threads = 1; + dim_t id = 0; + + // Allcoate a global communicator for the root thrinfo_t structures. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); cntl_t* cntl_use; + thrinfo_t* thread; // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + // Create the root node of the thread's thrinfo_t structure. + bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); + func ( alpha, @@ -101,11 +108,18 @@ void bli_l3_thread_decorator c, cntx, cntl_use, - thread[0] + thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + + // Free the current thread's thrinfo_t structure. + bli_l3_thrinfo_free( thread ); + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called above). 
} diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 43f0eaf8b..d42744162 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -78,8 +78,8 @@ void bli_thread_get_range_sub dim_t* end ) { - dim_t n_way = thread->n_way; - dim_t work_id = thread->work_id; + dim_t n_way = bli_thread_n_way( thread ); + dim_t work_id = bli_thread_work_id( thread ); dim_t all_start = 0; dim_t all_end = n; @@ -511,8 +511,8 @@ siz_t bli_thread_get_range_weighted_sub dim_t* j_end_thr ) { - dim_t n_way = thread->n_way; - dim_t my_id = thread->work_id; + dim_t n_way = bli_thread_n_way( thread ); + dim_t my_id = bli_thread_work_id( thread ); dim_t bf_left = n % bf; diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 10097c39e..5b9443587 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -173,16 +173,14 @@ typedef void (*l3int_t) // Level-3 thread decorator prototype void bli_l3_thread_decorator ( - dim_t n_threads, - l3int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t** thread + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl ); // Miscellaneous prototypes diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index 4cf55b3d4..bad5c2772 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -38,11 +38,9 @@ thrinfo_t* bli_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - bool_t free_comms, + bool_t free_comm, thrinfo_t* sub_node ) { @@ -52,9 +50,8 @@ thrinfo_t* bli_thrinfo_create ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, - free_comms, + free_comm, sub_node ); @@ -66,23 +63,19 @@ void bli_thrinfo_init thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - bool_t free_comms, + bool_t 
free_comm, thrinfo_t* sub_node ) { - thread->ocomm = ocomm; - thread->ocomm_id = ocomm_id; - thread->icomm = icomm; - thread->icomm_id = icomm_id; - thread->n_way = n_way; - thread->work_id = work_id; - thread->free_comms = free_comms; + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->free_comm = free_comm; - thread->sub_node = sub_node; + thread->sub_node = sub_node; } void bli_thrinfo_init_single @@ -94,7 +87,6 @@ void bli_thrinfo_init_single ( thread, &BLIS_SINGLE_COMM, 0, - &BLIS_SINGLE_COMM, 0, 1, 0, FALSE, @@ -102,3 +94,178 @@ void bli_thrinfo_init_single ); } +// ----------------------------------------------------------------------------- + +#include "assert.h" + +#define BLIS_NUM_STATIC_COMMS 18 + +thrinfo_t* bli_thrinfo_create_for_cntl + ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_chl, + thrinfo_t* thread_par + ) +{ + thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; + thrcomm_t** new_comms = NULL; + + thrinfo_t* thread_chl; + + bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); + + dim_t parent_nt_in = bli_thread_num_threads( thread_par ); + dim_t parent_n_way = bli_thread_n_way( thread_par ); + dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); + dim_t parent_work_id = bli_thread_work_id( thread_par ); + + dim_t child_nt_in; + dim_t child_comm_id; + dim_t child_n_way; + dim_t child_work_id; + + // Sanity check: make sure the number of threads in the parent's + // communicator is divisible by the number of new sub-groups. + assert( parent_nt_in % parent_n_way == 0 ); + + // Compute: + // - the number of threads inside the new child comm, + // - the current thread's id within the new communicator, + // - the current thread's work id, given the ways of parallelism + // to be obtained within the next loop. 
+ child_nt_in = bli_cntx_get_num_threads_in( cntx, cntl_chl ); + child_n_way = bli_cntx_way_for_bszid( bszid_chl, cntx ); + child_comm_id = parent_comm_id % child_nt_in; + child_work_id = child_comm_id / ( child_nt_in / child_n_way ); + + // The parent's chief thread creates a temporary array of thrcomm_t + // pointers. + if ( bli_thread_am_ochief( thread_par ) ) + { + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) + new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ) ); + else + new_comms = static_comms; + } + + // Broadcast the temporary array to all threads in the parent's + // communicator. + new_comms = bli_thread_obroadcast( thread_par, new_comms ); + + // Chiefs in the child communicator allocate the communicator + // object and store it in the array element corresponding to the + // parent's work id. + if ( child_comm_id == 0 ) + new_comms[ parent_work_id ] = bli_thrcomm_create( child_nt_in ); + + bli_thread_obarrier( thread_par ); + + // All threads create a new thrinfo_t node using the communicator + // that was created by their chief, as identified by parent_work_id. + thread_chl = bli_thrinfo_create + ( + new_comms[ parent_work_id ], + child_comm_id, + child_n_way, + child_work_id, + TRUE, + NULL + ); + + bli_thread_obarrier( thread_par ); + + // The parent's chief thread frees the temporary array of thrcomm_t + // pointers. + if ( bli_thread_am_ochief( thread_par ) ) + { + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) + bli_free_intl( new_comms ); + } + + return thread_chl; +} + +void bli_thrinfo_grow + ( + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // If the sub-node of the thrinfo_t object is non-NULL, we don't + // need to create it, and will just use the existing sub-node as-is. + if ( bli_thrinfo_sub_node( thread ) != NULL ) return; + + // Create a new node (or, if needed, multiple nodes) and return the + // pointer to the (eldest) child. 
+ thrinfo_t* thread_child = bli_thrinfo_rgrow + ( + cntx, + cntl, + bli_cntl_sub_node( cntl ), + thread + ); + + // Attach the child thrinfo_t node to its parent structure. + bli_thrinfo_set_sub_node( thread_child, thread ); +} + +thrinfo_t* bli_thrinfo_rgrow + ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_cur, + thrinfo_t* thread_par + ) +{ + thrinfo_t* thread_cur; + + // We must handle two cases: those where the next node in the + // control tree is a partitioning node, and those where it is + // a non-partitioning (ie: packing) node. + if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) + { + // Create the child thrinfo_t node corresponding to cntl_cur, + // with cntl_par being the parent. + thread_cur = bli_thrinfo_create_for_cntl + ( + cntx, + cntl_par, + cntl_cur, + thread_par + ); + } + else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) + { + // Recursively grow the thread structure and return the top-most + // thrinfo_t node of that segment. + thrinfo_t* thread_seg = bli_thrinfo_rgrow + ( + cntx, + cntl_par, + bli_cntl_sub_node( cntl_cur ), + thread_par + ); + + // Create a thrinfo_t node corresponding to cntl_cur. Notice that + // the free_comm field is set to FALSE, since cntl_cur is a + // non-partitioning node. The communicator used here will be + // freed when thread_seg, or one of its descendents, is freed. + thread_cur = bli_thrinfo_create + ( + bli_thrinfo_ocomm( thread_seg ), + bli_thread_ocomm_id( thread_seg ), + bli_cntx_get_num_threads_in( cntx, cntl_cur ), + bli_thread_ocomm_id( thread_seg ), + FALSE, + thread_seg + ); + + // Attach the child thrinfo_t node to its parent structure. + bli_thrinfo_set_sub_node( thread_cur, thread_par ); + } + + return thread_cur; +} + diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 9c0b28575..93bf19e50 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -45,13 +45,6 @@ struct thrinfo_s // Our thread id within the ocomm thread communicator. 
dim_t ocomm_id; - // The thread communicator for the other threads sharing the same work - // at this level. - thrcomm_t* icomm; - - // Our thread id within the icomm thread communicator. - dim_t icomm_id; - // The number of distinct threads used to parallelize the loop. dim_t n_way; @@ -62,7 +55,7 @@ struct thrinfo_s // this is field is true, but when nodes are created that share the same // communicators as other nodes (such as with packm nodes), this is set // to false. - bool_t free_comms; + bool_t free_comm; struct thrinfo_s* sub_node; }; @@ -71,30 +64,40 @@ typedef struct thrinfo_s thrinfo_t; // // thrinfo_t macros // NOTE: The naming of these should be made consistent at some point. +// (ie: bli_thrinfo_ vs. bli_thread_) // -#define bli_thread_num_threads( t ) ( (t)->ocomm->n_threads ) +// thrinfo_t query (field only) -#define bli_thread_n_way( t ) ( (t)->n_way ) -#define bli_thread_work_id( t ) ( (t)->work_id ) +#define bli_thread_num_threads( t ) ( (t)->ocomm->n_threads ) -#define bli_thread_am_ochief( t ) ( (t)->ocomm_id == 0 ) -#define bli_thread_am_ichief( t ) ( (t)->icomm_id == 0 ) +#define bli_thread_n_way( t ) ( (t)->n_way ) +#define bli_thread_work_id( t ) ( (t)->work_id ) +#define bli_thread_ocomm_id( t ) ( (t)->ocomm_id ) + +#define bli_thrinfo_ocomm( t ) ( (t)->ocomm ) +#define bli_thrinfo_needs_free_comm( t ) ( (t)->free_comm ) + +#define bli_thrinfo_sub_node( t ) ( (t)->sub_node ) + +// thrinfo_t query (complex) + +#define bli_thread_am_ochief( t ) ( (t)->ocomm_id == 0 ) + +// thrinfo_t modification + +#define bli_thrinfo_set_sub_node( _sub_node, thread ) \ +{ \ + (thread)->sub_node = _sub_node; \ +} + +// other thrinfo_t-related macros #define bli_thread_obroadcast( t, p ) bli_thrcomm_bcast( (t)->ocomm, \ (t)->ocomm_id, p ) -#define bli_thread_ibroadcast( t, p ) bli_thrcomm_bcast( (t)->icomm, \ - (t)->icomm_id, p ) #define bli_thread_obarrier( t ) bli_thrcomm_barrier( (t)->ocomm, \ (t)->ocomm_id ) -#define bli_thread_ibarrier( t ) 
bli_thrcomm_barrier( (t)->icomm, \ - (t)->icomm_id ) -#define bli_thrinfo_ocomm( t ) ( (t)->ocomm ) -#define bli_thrinfo_icomm( t ) ( (t)->icomm ) -#define bli_thrinfo_needs_free_comms( t ) ( (t)->free_comms ) - -#define bli_thrinfo_sub_node( t ) ( (t)->sub_node ) // // Prototypes for level-3 thrinfo functions not specific to any operation. @@ -104,11 +107,9 @@ thrinfo_t* bli_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - bool_t free_comms, + bool_t free_comm, thrinfo_t* sub_node ); @@ -117,11 +118,9 @@ void bli_thrinfo_init thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - bool_t free_comms, + bool_t free_comm, thrinfo_t* sub_node ); @@ -130,9 +129,29 @@ void bli_thrinfo_init_single thrinfo_t* thread ); -void bli_thrinfo_free +// ----------------------------------------------------------------------------- + +thrinfo_t* bli_thrinfo_create_for_cntl ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_chl, + thrinfo_t* thread_par + ); + +void bli_thrinfo_grow + ( + cntx_t* cntx, + cntl_t* cntl, thrinfo_t* thread ); +thrinfo_t* bli_thrinfo_rgrow + ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_cur, + thrinfo_t* thread_par + ); + #endif From 86969873b5b861966d717d8f9f370af39e3d9de6 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 4 Oct 2016 14:24:59 -0500 Subject: [PATCH 15/27] Reclassified amaxv operation as a level-1v kernel. Details: - Moved amaxv from being a utility operation to being a level-1v operation. This includes the establishment of a new amaxv kernel to live beside all of the other level-1v kernels. - Added two new functions to bli_part.c: bli_acquire_mij() bli_acquire_vi() The first acquires a scalar object for the (i,j) element of a matrix, and the second acquires a scalar object for the ith element of a vector. - Added integer support to bli_getsc level-0 operation. 
This involved adding integer support to the bli_*gets level-0 scalar macros. - Added a new test module to test amaxv as a level-1v operation. The test module works by comparing the value identified by bli_amaxv() to the value found from a reference-like code local to the test module source file. In other words, it (intentionally) does not guarantee the same index is found; only the same value. This allows for different implementations in the case where a vector contains two or more elements containing exactly the same floating point value (or values, in the case of the complex domain). - Removed the directory frame/include/old/. --- frame/0/bli_l0_check.c | 8 +- frame/0/bli_l0_oapi.c | 8 +- frame/0/bli_l0_tapi.c | 22 + frame/0/bli_l0_tapi.h | 16 + frame/1/bli_l1v_check.c | 51 ++ frame/1/bli_l1v_check.h | 30 +- frame/1/bli_l1v_cntx.c | 1 + frame/1/bli_l1v_cntx.h | 1 + frame/1/bli_l1v_ft.h | 15 + frame/1/bli_l1v_ker.h | 14 + frame/1/bli_l1v_oapi.c | 38 ++ frame/1/bli_l1v_oapi.h | 13 + frame/1/bli_l1v_tapi.c | 32 ++ frame/1/bli_l1v_tapi.h | 3 + frame/1/kernels/bli_amaxv_ref.c | 134 +++++ frame/base/bli_gks.c | 6 + frame/base/bli_part.c | 32 ++ frame/base/bli_part.h | 17 + frame/compat/bla_amax.c | 3 +- frame/include/bli_kernel_macro_defs.h | 18 + frame/include/bli_kernel_pre_macro_defs.h | 7 + frame/include/bli_kernel_prototypes.h | 5 + frame/include/bli_param_macro_defs.h | 8 + frame/include/bli_type_defs.h | 3 +- frame/include/level0/bli_gets.h | 11 + frame/include/level0/bli_sets.h | 11 + .../include/old/bli_kernel_post_macro_defs.h | 125 ----- frame/include/old/bli_kernel_prototypes.h | 529 ------------------ frame/include/old/bli_kernel_type_defs.h | 137 ----- frame/util/bli_util_check.c | 51 -- frame/util/bli_util_check.h | 12 - frame/util/bli_util_oapi.c | 38 -- frame/util/bli_util_oapi.h | 13 - frame/util/bli_util_tapi.c | 44 -- frame/util/bli_util_tapi.h | 14 - frame/util/bli_util_unb_var1.c | 65 --- frame/util/bli_util_unb_var1.h | 14 - 
testsuite/input.operations | 4 + testsuite/src/test_amaxv.c | 400 +++++++++++++ testsuite/src/test_amaxv.h | 40 ++ testsuite/src/test_libblis.c | 6 +- testsuite/src/test_libblis.h | 2 + 42 files changed, 941 insertions(+), 1060 deletions(-) create mode 100644 frame/1/kernels/bli_amaxv_ref.c delete mode 100644 frame/include/old/bli_kernel_post_macro_defs.h delete mode 100644 frame/include/old/bli_kernel_prototypes.h delete mode 100644 frame/include/old/bli_kernel_type_defs.h create mode 100644 testsuite/src/test_amaxv.c create mode 100644 testsuite/src/test_amaxv.h diff --git a/frame/0/bli_l0_check.c b/frame/0/bli_l0_check.c index da47a6fd5..fc1c4c71a 100644 --- a/frame/0/bli_l0_check.c +++ b/frame/0/bli_l0_check.c @@ -99,8 +99,8 @@ void bli_getsc_check // Check object datatypes. - e_val = bli_check_noninteger_object( chi ); - bli_check_error_code( e_val ); + //e_val = bli_check_noninteger_object( chi ); + //bli_check_error_code( e_val ); // Check object dimensions. @@ -125,8 +125,8 @@ void bli_setsc_check // Check object datatypes. - e_val = bli_check_floating_object( chi ); - bli_check_error_code( e_val ); + //e_val = bli_check_floating_object( chi ); + //bli_check_error_code( e_val ); // Check object dimensions. diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c index d20f8ea45..3858e05b7 100644 --- a/frame/0/bli_l0_oapi.c +++ b/frame/0/bli_l0_oapi.c @@ -198,8 +198,8 @@ void PASTEMAC0(opname) \ if ( bli_is_constant( dt_chi ) ) dt_use = dt_def; \ else dt_use = dt_chi; \ \ - /* Invoke the typed function. */ \ - bli_call_ft_3 \ + /* Invoke the typed function (with integer support). */ \ + bli_call_ft_3i \ ( \ dt_use, \ opname, \ @@ -229,8 +229,8 @@ void PASTEMAC0(opname) \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \ \ - /* Invoke the typed function. */ \ - bli_call_ft_3 \ + /* Invoke the typed function (with integer support). 
*/ \ + bli_call_ft_3i \ ( \ dt_chi, \ opname, \ diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c index 53f5be271..028a12cbd 100644 --- a/frame/0/bli_l0_tapi.c +++ b/frame/0/bli_l0_tapi.c @@ -227,3 +227,25 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNCR_BASIC0( zipsc ) +// ----------------------------------------------------------------------------- + +void bli_igetsc + ( + dim_t* chi, + double* zeta_r, + double* zeta_i + ) +{ + PASTEMAC2(i,d,gets)( *chi, *zeta_r, *zeta_i ); +} + +void bli_isetsc + ( + double zeta_r, + double zeta_i, + dim_t* chi + ) +{ + PASTEMAC2(d,i,sets)( zeta_r, zeta_i, *chi ); +} + diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h index 678e27292..36b282824 100644 --- a/frame/0/bli_l0_tapi.h +++ b/frame/0/bli_l0_tapi.h @@ -141,3 +141,19 @@ void PASTEMAC(ch,opname) \ INSERT_GENTPROTR_BASIC( zipsc ) +// ----------------------------------------------------------------------------- + +void bli_igetsc + ( + dim_t* chi, + double* zeta_r, + double* zeta_i + ); + +void bli_isetsc + ( + double zeta_r, + double zeta_i, + dim_t* chi + ); + diff --git a/frame/1/bli_l1v_check.c b/frame/1/bli_l1v_check.c index b998a65fb..54c856b45 100644 --- a/frame/1/bli_l1v_check.c +++ b/frame/1/bli_l1v_check.c @@ -56,6 +56,21 @@ GENFRONT( subv ) GENFRONT( swapv ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* index \ + ) \ +{ \ + bli_l1v_xi_check( x, index ); \ +} + +GENFRONT( amaxv ) + + #undef GENFRONT #define GENFRONT( opname ) \ \ @@ -481,3 +496,39 @@ void bli_l1v_ax_check bli_check_error_code( e_val ); } +void bli_l1v_xi_check + ( + obj_t* x, + obj_t* index + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_integer_object( index ); + bli_check_error_code( e_val ); + + e_val = bli_check_nonconstant_object( index ); + bli_check_error_code( e_val ); + + // Check object dimensions. 
+ + e_val = bli_check_vector_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_scalar_object( index ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( index ); + bli_check_error_code( e_val ); +} + diff --git a/frame/1/bli_l1v_check.h b/frame/1/bli_l1v_check.h index d4a1e9ff9..ddfe6a050 100644 --- a/frame/1/bli_l1v_check.h +++ b/frame/1/bli_l1v_check.h @@ -44,7 +44,7 @@ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ - ); + ); GENTPROT( addv ) GENTPROT( copyv ) @@ -52,6 +52,18 @@ GENTPROT( subv ) GENTPROT( swapv ) +#undef GENTPROT +#define GENTPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* index \ + ); + +GENTPROT( amaxv ) + + #undef GENTPROT #define GENTPROT( opname ) \ \ @@ -74,7 +86,7 @@ void PASTEMAC(opname,_check) \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ - ); + ); GENTPROT( axpyv ) GENTPROT( scal2v ) @@ -88,7 +100,7 @@ void PASTEMAC(opname,_check) \ obj_t* x, \ obj_t* y, \ obj_t* rho \ - ); + ); GENTPROT( dotv ) @@ -103,7 +115,7 @@ void PASTEMAC(opname,_check) \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ - ); + ); GENTPROT( dotxv ) @@ -114,7 +126,7 @@ GENTPROT( dotxv ) void PASTEMAC(opname,_check) \ ( \ obj_t* x \ - ); + ); GENTPROT( invertv ) @@ -126,7 +138,7 @@ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ - ); + ); GENTPROT( scalv ) GENTPROT( setv ) @@ -196,3 +208,9 @@ void bli_l1v_ax_check obj_t* x ); +void bli_l1v_xi_check + ( + obj_t* x, + obj_t* index + ); + diff --git a/frame/1/bli_l1v_cntx.c b/frame/1/bli_l1v_cntx.c index a1bba0354..bdbb0063f 100644 --- a/frame/1/bli_l1v_cntx.c +++ b/frame/1/bli_l1v_cntx.c @@ -55,6 +55,7 @@ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ } GENFRONT( addv, BLIS_ADDV_KER ) +GENFRONT( amaxv, BLIS_AMAXV_KER ) GENFRONT( copyv, BLIS_COPYV_KER ) GENFRONT( dotv, BLIS_DOTV_KER ) GENFRONT( dotxv, 
BLIS_DOTXV_KER ) diff --git a/frame/1/bli_l1v_cntx.h b/frame/1/bli_l1v_cntx.h index a8c16d342..95cd4a131 100644 --- a/frame/1/bli_l1v_cntx.h +++ b/frame/1/bli_l1v_cntx.h @@ -44,6 +44,7 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addv ) +GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( copyv ) diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h index c4e206df7..b2b80e016 100644 --- a/frame/1/bli_l1v_ft.h +++ b/frame/1/bli_l1v_ft.h @@ -58,6 +58,21 @@ INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) +// amaxv + +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH2(ch,opname,tsuf)) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx, \ + dim_t* restrict index, \ + cntx_t* cntx \ + ); + +INSERT_GENTDEF( amaxv ) + // axpbyv #undef GENTDEF diff --git a/frame/1/bli_l1v_ker.h b/frame/1/bli_l1v_ker.h index cf80eda46..8039905b7 100644 --- a/frame/1/bli_l1v_ker.h +++ b/frame/1/bli_l1v_ker.h @@ -54,6 +54,20 @@ INSERT_GENTPROT_BASIC( copyv_ker_name ) INSERT_GENTPROT_BASIC( subv_ker_name ) +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx, \ + dim_t* restrict index, \ + cntx_t* cntx \ + ); \ + +INSERT_GENTPROT_BASIC( amaxv_ker_name ) + + #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ diff --git a/frame/1/bli_l1v_oapi.c b/frame/1/bli_l1v_oapi.c index cebc3bfb5..67525d68c 100644 --- a/frame/1/bli_l1v_oapi.c +++ b/frame/1/bli_l1v_oapi.c @@ -82,6 +82,44 @@ GENFRONT( copyv ) GENFRONT( subv ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* index \ + BLIS_OAPI_CNTX_PARAM \ + ) \ +{ \ + BLIS_OAPI_CNTX_DECL \ +\ + num_t dt = bli_obj_datatype( *x ); \ +\ + dim_t n = bli_obj_vector_dim( *x ); \ + void* buf_x = bli_obj_buffer_at_off( *x ); \ + inc_t incx = bli_obj_vector_inc( *x ); \ 
+\ + void* buf_index = bli_obj_buffer_at_off( *index ); \ +\ + if ( bli_error_checking_is_enabled() ) \ + PASTEMAC(opname,_check)( x, index ); \ +\ + /* Invoke the typed function. */ \ + bli_call_ft_5 \ + ( \ + dt, \ + opname, \ + n, \ + buf_x, incx, \ + buf_index, \ + cntx \ + ); \ +} + +GENFRONT( amaxv ) + + #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/1/bli_l1v_oapi.h b/frame/1/bli_l1v_oapi.h index ff277421c..1c7e534da 100644 --- a/frame/1/bli_l1v_oapi.h +++ b/frame/1/bli_l1v_oapi.h @@ -52,6 +52,19 @@ GENTPROT( copyv ) GENTPROT( subv ) +#undef GENTPROT +#define GENTPROT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* index \ + BLIS_OAPI_CNTX_PARAM \ + ); + +GENTPROT( amaxv ) + + #undef GENTPROT #define GENTPROT( opname ) \ \ diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 4cf6be24e..74a548eea 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -74,6 +74,38 @@ INSERT_GENTFUNC_BASIC( copyv, BLIS_COPYV_KER ) INSERT_GENTFUNC_BASIC( subv, BLIS_SUBV_KER ) +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, kerid ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t n, \ + ctype* x, inc_t incx, \ + dim_t* index, \ + cntx_t* cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ +\ + bli_cntx_init_local_if( opname, cntx, cntx_p ); \ +\ + PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ +\ + f \ + ( \ + n, \ + x, incx, \ + index, \ + cntx_p \ + ); \ +\ + bli_cntx_finalize_local_if( opname, cntx ); \ +} + +INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER ) + + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h index b4b36b059..86cdf416d 100644 --- a/frame/1/bli_l1v_tapi.h +++ b/frame/1/bli_l1v_tapi.h @@ -40,6 +40,9 @@ #undef addv_ker_name #define addv_ker_name addv +#undef amaxv_ker_name +#define amaxv_ker_name amaxv + #undef axpbyv_ker_name #define axpbyv_ker_name axpbyv 
diff --git a/frame/1/kernels/bli_amaxv_ref.c b/frame/1/kernels/bli_amaxv_ref.c new file mode 100644 index 000000000..f207b799f --- /dev/null +++ b/frame/1/kernels/bli_amaxv_ref.c @@ -0,0 +1,134 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define BLAS-like interfaces with typed operands. 
+// + +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t n, \ + ctype* x, inc_t incx, \ + dim_t* i_max, \ + cntx_t* cntx \ + ) \ +{ \ + ctype_r* minus_one = PASTEMAC(chr,m1); \ + dim_t* zero_i = PASTEMAC(i,0); \ +\ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r abs_chi1; \ + ctype_r abs_chi1_max; \ + dim_t i; \ +\ + /* Initialize the index of the maximum absolute value to zero. */ \ + PASTEMAC(i,copys)( zero_i, *i_max ); \ +\ + /* If the vector length is zero, return early. This directly emulates + the behavior of netlib BLAS's i?amax() routines. */ \ + if ( bli_zero_dim1( n ) ) return; \ +\ + /* Initialize the maximum absolute value search candidate with + -1, which is guaranteed to be less than all values we will + compute. */ \ + PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ +\ + if ( incx == 1 ) \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + /* Get the real and imaginary components of chi1. */ \ + PASTEMAC2(ch,chr,gets)( x[i], chi1_r, chi1_i ); \ +\ + /* Replace chi1_r and chi1_i with their absolute values. */ \ + PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ + PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ +\ + /* Add the real and imaginary absolute values together. */ \ + PASTEMAC(chr,set0s)( abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ +\ + /* If the absolute value of the current element exceeds that of + the previous largest, save it and its index. If NaN is + encountered, then treat it the same as if it were a valid + value that was larger than any previously seen. This + behavior mimics that of LAPACK's ?lange(). */ \ + if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ + { \ + abs_chi1_max = abs_chi1; \ + *i_max = i; \ + } \ + } \ + } \ + else \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + ctype* chi1 = x + (i )*incx; \ +\ + /* Get the real and imaginary components of chi1. 
*/ \ + PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ +\ + /* Replace chi1_r and chi1_i with their absolute values. */ \ + PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ + PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ +\ + /* Add the real and imaginary absolute values together. */ \ + PASTEMAC(chr,set0s)( abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ +\ + /* If the absolute value of the current element exceeds that of + the previous largest, save it and its index. If NaN is + encountered, then treat it the same as if it were a valid + value that was larger than any previously seen. This + behavior mimics that of LAPACK's ?lange(). */ \ + if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ + { \ + abs_chi1_max = abs_chi1; \ + *i_max = i; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCR_BASIC0( amaxv_ref ) + diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 6ae0f461e..7f3f897d5 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -754,6 +754,9 @@ static func_t bli_gks_l1v_kers[BLIS_NUM_LEVEL1V_KERS] = /* addv */ { { BLIS_SADDV_KERNEL, BLIS_CADDV_KERNEL, BLIS_DADDV_KERNEL, BLIS_ZADDV_KERNEL, } }, +/* amaxv */ { { BLIS_SAMAXV_KERNEL, BLIS_CAMAXV_KERNEL, + BLIS_DAMAXV_KERNEL, BLIS_ZAMAXV_KERNEL, } + }, /* axpbyv */ { { BLIS_SAXPBYV_KERNEL, BLIS_CAXPBYV_KERNEL, BLIS_DAXPBYV_KERNEL, BLIS_ZAXPBYV_KERNEL, } }, @@ -798,6 +801,9 @@ static func_t bli_gks_l1v_ref_kers[BLIS_NUM_LEVEL1V_KERS] = /* addv */ { { BLIS_SADDV_KERNEL_REF, BLIS_CADDV_KERNEL_REF, BLIS_DADDV_KERNEL_REF, BLIS_ZADDV_KERNEL_REF, } }, +/* amaxv */ { { BLIS_SAMAXV_KERNEL_REF, BLIS_CAMAXV_KERNEL_REF, + BLIS_DAMAXV_KERNEL_REF, BLIS_ZAMAXV_KERNEL_REF, } + }, /* axpbyv */ { { BLIS_SAXPBYV_KERNEL_REF, BLIS_CAXPBYV_KERNEL_REF, BLIS_DAXPBYV_KERNEL_REF, BLIS_ZAXPBYV_KERNEL_REF, } }, diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index 8951a1d62..738284064 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -740,3 
+740,35 @@ void bli_acquire_vpart_b2f bli_acquire_mpart_r2l( req_part, i, b, obj, sub_obj ); } + +// -- Scalar acquisition ------------------------------------------------------- + + +void bli_acquire_mij + ( + dim_t i, + dim_t j, + obj_t* obj, + obj_t* sub_obj + ) +{ + obj_t tmp_obj; + + bli_acquire_mpart_l2r( BLIS_SUBPART1, j, 1, obj, &tmp_obj ); + bli_acquire_mpart_t2b( BLIS_SUBPART1, i, 1, &tmp_obj, sub_obj ); +} + + +void bli_acquire_vi + ( + dim_t i, + obj_t* obj, + obj_t* sub_obj + ) +{ + if ( bli_obj_is_col_vector( *obj ) ) + bli_acquire_mpart_t2b( BLIS_SUBPART1, i, 1, obj, sub_obj ); + else // if ( bli_obj_is_row_vector( *obj ) ) + bli_acquire_mpart_l2r( BLIS_SUBPART1, i, 1, obj, sub_obj ); +} + diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h index 0d3d021b4..fd24f1d82 100644 --- a/frame/base/bli_part.h +++ b/frame/base/bli_part.h @@ -76,3 +76,20 @@ GENPROT( acquire_mpart_br2tl ) GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) +// -- Scalar acquisition ------------------------------------------------------- + +void bli_acquire_mij + ( + dim_t i, + dim_t j, + obj_t* obj, + obj_t* sub_obj + ); + +void bli_acquire_vi + ( + dim_t i, + obj_t* obj, + obj_t* sub_obj + ); + diff --git a/frame/compat/bla_amax.c b/frame/compat/bla_amax.c index 24aa192e3..1b63e0b7e 100644 --- a/frame/compat/bla_amax.c +++ b/frame/compat/bla_amax.c @@ -80,7 +80,8 @@ f77_int PASTEF772(i,chx,blasname) \ ); \ \ /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran) - index. */ \ + index. Also, if the BLAS integer size differs from the BLIS + integer size, that typecast occurs here. */ \ f77_index = bli_index + 1; \ \ /* Finalize BLIS (if it was initialized above). 
*/ \ diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 00a2aa4b9..355412e2b 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -705,6 +705,24 @@ // Level-1v // +// amaxv kernels + +#ifndef BLIS_SAMAXV_KERNEL +#define BLIS_SAMAXV_KERNEL BLIS_SAMAXV_KERNEL_REF +#endif + +#ifndef BLIS_DAMAXV_KERNEL +#define BLIS_DAMAXV_KERNEL BLIS_DAMAXV_KERNEL_REF +#endif + +#ifndef BLIS_CAMAXV_KERNEL +#define BLIS_CAMAXV_KERNEL BLIS_CAMAXV_KERNEL_REF +#endif + +#ifndef BLIS_ZAMAXV_KERNEL +#define BLIS_ZAMAXV_KERNEL BLIS_ZAMAXV_KERNEL_REF +#endif + // addv kernels #ifndef BLIS_SADDV_KERNEL diff --git a/frame/include/bli_kernel_pre_macro_defs.h b/frame/include/bli_kernel_pre_macro_defs.h index 98e4c3928..30ed3e3f2 100644 --- a/frame/include/bli_kernel_pre_macro_defs.h +++ b/frame/include/bli_kernel_pre_macro_defs.h @@ -260,6 +260,13 @@ #define BLIS_CADDV_KERNEL_REF bli_caddv_ref #define BLIS_ZADDV_KERNEL_REF bli_zaddv_ref +// amaxv kernels + +#define BLIS_SAMAXV_KERNEL_REF bli_samaxv_ref +#define BLIS_DAMAXV_KERNEL_REF bli_damaxv_ref +#define BLIS_CAMAXV_KERNEL_REF bli_camaxv_ref +#define BLIS_ZAMAXV_KERNEL_REF bli_zamaxv_ref + // axpbyv kernels #define BLIS_SAXPBYV_KERNEL_REF bli_saxpbyv_ref diff --git a/frame/include/bli_kernel_prototypes.h b/frame/include/bli_kernel_prototypes.h index e693825ff..b788bbc1c 100644 --- a/frame/include/bli_kernel_prototypes.h +++ b/frame/include/bli_kernel_prototypes.h @@ -164,6 +164,11 @@ #define bli_caddv_ker_name BLIS_CADDV_KERNEL #define bli_zaddv_ker_name BLIS_ZADDV_KERNEL +#define bli_samaxv_ker_name BLIS_SAMAXV_KERNEL +#define bli_damaxv_ker_name BLIS_DAMAXV_KERNEL +#define bli_camaxv_ker_name BLIS_CAMAXV_KERNEL +#define bli_zamaxv_ker_name BLIS_ZAMAXV_KERNEL + #define bli_saxpbyv_ker_name BLIS_SAXPBYV_KERNEL #define bli_daxpbyv_ker_name BLIS_DAXPBYV_KERNEL #define bli_caxpbyv_ker_name BLIS_CAXPBYV_KERNEL diff --git 
a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 8869cea17..50ddd5d1f 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -1104,6 +1104,14 @@ else if ( bli_is_scomplex( dt ) ) PASTEMAC(c,fname)(o0,o1,o2); \ else if ( bli_is_dcomplex( dt ) ) PASTEMAC(z,fname)(o0,o1,o2); \ } +#define bli_call_ft_3i( dt, fname, o0, o1, o2 ) \ +{ \ + if ( bli_is_float( dt ) ) PASTEMAC(s,fname)(o0,o1,o2); \ + else if ( bli_is_double( dt ) ) PASTEMAC(d,fname)(o0,o1,o2); \ + else if ( bli_is_scomplex( dt ) ) PASTEMAC(c,fname)(o0,o1,o2); \ + else if ( bli_is_dcomplex( dt ) ) PASTEMAC(z,fname)(o0,o1,o2); \ + else if ( bli_is_int( dt ) ) PASTEMAC(i,fname)(o0,o1,o2); \ +} #define bli_call_ft_4( dt, fname, o0, o1, o2, o3 ) \ { \ if ( bli_is_float( dt ) ) PASTEMAC(s,fname)(o0,o1,o2,o3); \ diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 086740cfd..f4e3e4aa0 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -586,6 +586,7 @@ typedef enum typedef enum { BLIS_ADDV_KER = 0, + BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, @@ -600,7 +601,7 @@ typedef enum BLIS_XPBYV_KER, } l1vkr_t; -#define BLIS_NUM_LEVEL1V_KERS 13 +#define BLIS_NUM_LEVEL1V_KERS 14 typedef enum diff --git a/frame/include/level0/bli_gets.h b/frame/include/level0/bli_gets.h index 36e9af5c3..92d018159 100644 --- a/frame/include/level0/bli_gets.h +++ b/frame/include/level0/bli_gets.h @@ -46,27 +46,38 @@ #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, 
yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } + +#define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } +#define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } +#define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } +#define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } +#define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) +#define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif diff --git a/frame/include/level0/bli_sets.h b/frame/include/level0/bli_sets.h index 551d03025..61bd7e426 100644 --- a/frame/include/level0/bli_sets.h +++ b/frame/include/level0/bli_sets.h @@ -45,11 +45,13 @@ #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } +#define bli_issets( xr, xi, y ) { (y) = (xr); } #define 
bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } +#define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX @@ -57,11 +59,13 @@ #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } +#define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } +#define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX @@ -77,11 +81,18 @@ #endif // BLIS_ENABLE_C99_COMPLEX +#define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } +#define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } +#define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } +#define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } +#define bli_iisets( xr, xi, y ) { (y) = (xr); } + #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) +#define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif diff --git a/frame/include/old/bli_kernel_post_macro_defs.h b/frame/include/old/bli_kernel_post_macro_defs.h deleted file mode 100644 index 4a261b033..000000000 --- a/frame/include/old/bli_kernel_post_macro_defs.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_KERNEL_POST_MACRO_DEFS_H -#define BLIS_KERNEL_POST_MACRO_DEFS_H - - -// -- Maximum register blocksize search ---------------------------------------- - -// The macro-kernels oftentimes need to statically allocate a temporary -// MR x NR micro-tile of C. This micro-tile must be sized such that it will -// work for both native and induced implementations, since the user can switch -// between them at runtime. 
In order to facilitate the sizing of those -// micro-tiles, we must determine the largest the register blocksizes would -// need to be to accommodate both native and induced-based complex -// micro-kernels. For real datatypes, the maximum is never larger than the -// actual s and d register blocksizes. However, for complex datatypes, the -// "native" register blocksizes may differ from the "virtual" register -// blocksizes used by the induced implementations. Usually, it is the register -// blocksizes used for induced-based complex micro-kernels that would be -// larger, and thus determine the maximum for c and z datatypes. But, we -// prefer not to assume this, therefore, we always take the larger of the -// two values. - -#define BLIS_DEFAULT_IND_MR_C BLIS_DEFAULT_MR_S -#define BLIS_DEFAULT_IND_NR_C BLIS_DEFAULT_NR_S -#define BLIS_DEFAULT_IND_MR_Z BLIS_DEFAULT_MR_D -#define BLIS_DEFAULT_IND_NR_Z BLIS_DEFAULT_NR_D - -// -// Find the largest register blocksize MR. -// - -#define BLIS_MAX_DEFAULT_MR_S BLIS_DEFAULT_MR_S -#define BLIS_MAX_DEFAULT_MR_D BLIS_DEFAULT_MR_D - -// Choose between the native and induced blocksize for scomplex. -#define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_MR_C -#if BLIS_DEFAULT_IND_MR_C > BLIS_MAX_DEFAULT_MR_C -#undef BLIS_MAX_DEFAULT_MR_C -#define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_IND_MR_C -#endif - -// Choose between the native and induced blocksize for dcomplex. -#define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_MR_Z -#if BLIS_DEFAULT_IND_MR_Z > BLIS_MAX_DEFAULT_MR_Z -#undef BLIS_MAX_DEFAULT_MR_Z -#define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_IND_MR_Z -#endif - -// -// Find the largest register blocksize NR. -// - -#define BLIS_MAX_DEFAULT_NR_S BLIS_DEFAULT_NR_S -#define BLIS_MAX_DEFAULT_NR_D BLIS_DEFAULT_NR_D - -// Choose between the native and induced blocksize for scomplex. 
-#define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_NR_C -#if BLIS_DEFAULT_IND_NR_C > BLIS_MAX_DEFAULT_NR_C -#undef BLIS_MAX_DEFAULT_NR_C -#define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_IND_NR_C -#endif - -// Choose between the native and induced blocksize for dcomplex. -#define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_NR_Z -#if BLIS_DEFAULT_IND_NR_Z > BLIS_MAX_DEFAULT_NR_Z -#undef BLIS_MAX_DEFAULT_NR_Z -#define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_IND_NR_Z -#endif - - -// -- Abbreiviated macros ------------------------------------------------------ - -// Here, we shorten the maximum blocksizes found above so that they can be -// derived via the PASTEMAC macro. - -// Maximum MR blocksizes - -#define bli_smaxmr BLIS_MAX_DEFAULT_MR_S -#define bli_dmaxmr BLIS_MAX_DEFAULT_MR_D -#define bli_cmaxmr BLIS_MAX_DEFAULT_MR_C -#define bli_zmaxmr BLIS_MAX_DEFAULT_MR_Z - -// Maximum NR blocksizes - -#define bli_smaxnr BLIS_MAX_DEFAULT_NR_S -#define bli_dmaxnr BLIS_MAX_DEFAULT_NR_D -#define bli_cmaxnr BLIS_MAX_DEFAULT_NR_C -#define bli_zmaxnr BLIS_MAX_DEFAULT_NR_Z - - -#endif - diff --git a/frame/include/old/bli_kernel_prototypes.h b/frame/include/old/bli_kernel_prototypes.h deleted file mode 100644 index 333b2c578..000000000 --- a/frame/include/old/bli_kernel_prototypes.h +++ /dev/null @@ -1,529 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_KERNEL_PROTOTYPES_H -#define BLIS_KERNEL_PROTOTYPES_H - - -// -- Define PASTEMAC-friendly kernel function name macros --------------------- - -// -// Level-3 -// - -// gemm micro-kernels - -#define bli_sGEMM_UKERNEL BLIS_SGEMM_UKERNEL -#define bli_dGEMM_UKERNEL BLIS_DGEMM_UKERNEL -#define bli_cGEMM_UKERNEL BLIS_CGEMM_UKERNEL -#define bli_zGEMM_UKERNEL BLIS_ZGEMM_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t k, \ - ctype* alpha, \ - ctype* a, \ - ctype* b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( GEMM_UKERNEL ) - -// gemmtrsm_l micro-kernels - -#define bli_sGEMMTRSM_L_UKERNEL BLIS_SGEMMTRSM_L_UKERNEL -#define bli_dGEMMTRSM_L_UKERNEL BLIS_DGEMMTRSM_L_UKERNEL -#define bli_cGEMMTRSM_L_UKERNEL BLIS_CGEMMTRSM_L_UKERNEL -#define bli_zGEMMTRSM_L_UKERNEL BLIS_ZGEMMTRSM_L_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void 
PASTEMAC(ch,kername) \ - ( \ - dim_t k, \ - ctype* alpha, \ - ctype* a10, \ - ctype* a11, \ - ctype* b01, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( GEMMTRSM_L_UKERNEL ) - -// gemmtrsm_u micro-kernels - -#define bli_sGEMMTRSM_U_UKERNEL BLIS_SGEMMTRSM_U_UKERNEL -#define bli_dGEMMTRSM_U_UKERNEL BLIS_DGEMMTRSM_U_UKERNEL -#define bli_cGEMMTRSM_U_UKERNEL BLIS_CGEMMTRSM_U_UKERNEL -#define bli_zGEMMTRSM_U_UKERNEL BLIS_ZGEMMTRSM_U_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t k, \ - ctype* alpha, \ - ctype* a12, \ - ctype* a11, \ - ctype* b21, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( GEMMTRSM_U_UKERNEL ) - -// trsm_l micro-kernels - -#define bli_sTRSM_L_UKERNEL BLIS_STRSM_L_UKERNEL -#define bli_dTRSM_L_UKERNEL BLIS_DTRSM_L_UKERNEL -#define bli_cTRSM_L_UKERNEL BLIS_CTRSM_L_UKERNEL -#define bli_zTRSM_L_UKERNEL BLIS_ZTRSM_L_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - ctype* a11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( TRSM_L_UKERNEL ) - -// trsm_u micro-kernels - -#define bli_sTRSM_U_UKERNEL BLIS_STRSM_U_UKERNEL -#define bli_dTRSM_U_UKERNEL BLIS_DTRSM_U_UKERNEL -#define bli_cTRSM_U_UKERNEL BLIS_CTRSM_U_UKERNEL -#define bli_zTRSM_U_UKERNEL BLIS_ZTRSM_U_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - ctype* a11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( TRSM_U_UKERNEL ) - - -// -// Level-1m -// - -// NOTE: We don't need any PASTEMAC-friendly aliases to packm kernel -// macros because they are used directly in the initialization of the -// function pointer array, rather than via a templatizing wrapper macro. 
- - -// -// Level-1f -// - -// axpy2v kernels - -#define bli_sssAXPY2V_KERNEL BLIS_SAXPY2V_KERNEL -#define bli_dddAXPY2V_KERNEL BLIS_DAXPY2V_KERNEL -#define bli_cccAXPY2V_KERNEL BLIS_CAXPY2V_KERNEL -#define bli_zzzAXPY2V_KERNEL BLIS_ZAXPY2V_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, kername ) \ -\ -void PASTEMAC3(chx,chy,chz,kername) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_xy* alpha1, \ - ctype_xy* alpha2, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_z* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( AXPY2V_KERNEL ) - -// dotaxpyv kernels - -#define bli_sssDOTAXPYV_KERNEL BLIS_SDOTAXPYV_KERNEL -#define bli_dddDOTAXPYV_KERNEL BLIS_DDOTAXPYV_KERNEL -#define bli_cccDOTAXPYV_KERNEL BLIS_CDOTAXPYV_KERNEL -#define bli_zzzDOTAXPYV_KERNEL BLIS_ZDOTAXPYV_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, kername ) \ -\ -void PASTEMAC3(chx,chy,chz,kername) \ - ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype_x* alpha, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_xy* rho, \ - ctype_z* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTAXPYV_KERNEL ) - -// axpyf kernels - -#define bli_sssAXPYF_KERNEL BLIS_SAXPYF_KERNEL -#define bli_dddAXPYF_KERNEL BLIS_DAXPYF_KERNEL -#define bli_cccAXPYF_KERNEL BLIS_CAXPYF_KERNEL -#define bli_zzzAXPYF_KERNEL BLIS_ZAXPYF_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,kername) \ - ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* alpha, \ - ctype_a* a, inc_t inca, inc_t lda, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3U12_BASIC( AXPYF_KERNEL ) - -// dotxf kernels - -#define bli_sssDOTXF_KERNEL BLIS_SDOTXF_KERNEL -#define bli_dddDOTXF_KERNEL 
BLIS_DDOTXF_KERNEL -#define bli_cccDOTXF_KERNEL BLIS_CDOTXF_KERNEL -#define bli_zzzDOTXF_KERNEL BLIS_ZDOTXF_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,kername) \ - ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* alpha, \ - ctype_a* a, inc_t inca, inc_t lda, \ - ctype_x* x, inc_t incx, \ - ctype_y* beta, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTXF_KERNEL ) - -// dotxaxpyf kernels - -#define bli_sssDOTXAXPYF_KERNEL BLIS_SDOTXAXPYF_KERNEL -#define bli_dddDOTXAXPYF_KERNEL BLIS_DDOTXAXPYF_KERNEL -#define bli_cccDOTXAXPYF_KERNEL BLIS_CDOTXAXPYF_KERNEL -#define bli_zzzDOTXAXPYF_KERNEL BLIS_ZDOTXAXPYF_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, kername ) \ -\ -void PASTEMAC3(cha,chb,chc,kername) \ - ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ab* alpha, \ - ctype_a* a, inc_t inca, inc_t lda, \ - ctype_b* w, inc_t incw, \ - ctype_b* x, inc_t incx, \ - ctype_c* beta, \ - ctype_c* y, inc_t incy, \ - ctype_c* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTXAXPYF_KERNEL ) - - -// -// Level-1v -// - -// addv kernels - -#define bli_ssADDV_KERNEL BLIS_SADDV_KERNEL -#define bli_ddADDV_KERNEL BLIS_DADDV_KERNEL -#define bli_ccADDV_KERNEL BLIS_CADDV_KERNEL -#define bli_zzADDV_KERNEL BLIS_ZADDV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( ADDV_KERNEL ) - -// axpyv kernels - -#define bli_sssAXPYV_KERNEL BLIS_SAXPYV_KERNEL -#define bli_dddAXPYV_KERNEL BLIS_DAXPYV_KERNEL -#define bli_cccAXPYV_KERNEL BLIS_CAXPYV_KERNEL -#define bli_zzzAXPYV_KERNEL BLIS_ZAXPYV_KERNEL - -#undef 
GENTPROT3 -#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_a* alpha, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3_BASIC( AXPYV_KERNEL ) - -// copyv kernels - -#define bli_ssCOPYV_KERNEL BLIS_SCOPYV_KERNEL -#define bli_ddCOPYV_KERNEL BLIS_DCOPYV_KERNEL -#define bli_ccCOPYV_KERNEL BLIS_CCOPYV_KERNEL -#define bli_zzCOPYV_KERNEL BLIS_ZCOPYV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( COPYV_KERNEL ) - -// dotv kernels - -#define bli_sssDOTV_KERNEL BLIS_SDOTV_KERNEL -#define bli_dddDOTV_KERNEL BLIS_DDOTV_KERNEL -#define bli_cccDOTV_KERNEL BLIS_CDOTV_KERNEL -#define bli_zzzDOTV_KERNEL BLIS_ZDOTV_KERNEL - -#undef GENTPROT3 -#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, kername ) \ -\ -void PASTEMAC3(chx,chy,chr,kername) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_r* rho \ - ); - -INSERT_GENTPROT3_BASIC( DOTV_KERNEL ) - -// dotxv kernels - -#define bli_sssDOTXV_KERNEL BLIS_SDOTXV_KERNEL -#define bli_dddDOTXV_KERNEL BLIS_DDOTXV_KERNEL -#define bli_cccDOTXV_KERNEL BLIS_CDOTXV_KERNEL -#define bli_zzzDOTXV_KERNEL BLIS_ZDOTXV_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, kername ) \ -\ -void PASTEMAC3(chx,chy,chr,kername) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_xy* alpha, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_r* beta, \ - ctype_r* rho \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTXV_KERNEL ) - -// invertv kernels - -#define bli_sINVERTV_KERNEL BLIS_SINVERTV_KERNEL -#define bli_dINVERTV_KERNEL BLIS_DINVERTV_KERNEL -#define 
bli_cINVERTV_KERNEL BLIS_CINVERTV_KERNEL -#define bli_zINVERTV_KERNEL BLIS_ZINVERTV_KERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx \ - ); - -INSERT_GENTPROT_BASIC( INVERTV_KERNEL ) - -// scal2v kernels - -#define bli_sssSCAL2V_KERNEL BLIS_SSCAL2V_KERNEL -#define bli_dddSCAL2V_KERNEL BLIS_DSCAL2V_KERNEL -#define bli_cccSCAL2V_KERNEL BLIS_CSCAL2V_KERNEL -#define bli_zzzSCAL2V_KERNEL BLIS_ZSCAL2V_KERNEL - -#undef GENTPROT3 -#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, kername ) \ -\ -void PASTEMAC3(chb,chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_b* beta, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3_BASIC( SCAL2V_KERNEL ) - -// scalv kernels - -#define bli_ssSCALV_KERNEL BLIS_SSCALV_KERNEL -#define bli_ddSCALV_KERNEL BLIS_DSCALV_KERNEL -#define bli_ccSCALV_KERNEL BLIS_CSCALV_KERNEL -#define bli_zzSCALV_KERNEL BLIS_ZSCALV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_b, ctype_x, chb, chx, kername ) \ -\ -void PASTEMAC2(chb,chx,kername) \ - ( \ - conj_t conjbeta, \ - dim_t n, \ - ctype_b* beta, \ - ctype_x* x, inc_t incx \ - ); - -INSERT_GENTPROT2_BASIC( SCALV_KERNEL ) - -// setv kernels - -#define bli_ssSETV_KERNEL BLIS_SSETV_KERNEL -#define bli_ddSETV_KERNEL BLIS_DSETV_KERNEL -#define bli_ccSETV_KERNEL BLIS_CSETV_KERNEL -#define bli_zzSETV_KERNEL BLIS_ZSETV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_b, ctype_x, chb, chx, kername ) \ -\ -void PASTEMAC2(chb,chx,kername) \ - ( \ - dim_t n, \ - ctype_b* beta, \ - ctype_x* x, inc_t incx \ - ); - -INSERT_GENTPROT2_BASIC( SETV_KERNEL ) - -// subv kernels - -#define bli_ssSUBV_KERNEL BLIS_SSUBV_KERNEL -#define bli_ddSUBV_KERNEL BLIS_DSUBV_KERNEL -#define bli_ccSUBV_KERNEL BLIS_CSUBV_KERNEL -#define bli_zzSUBV_KERNEL BLIS_ZSUBV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void 
PASTEMAC2(chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( SUBV_KERNEL ) - -// swapv kernels - -#define bli_ssSWAPV_KERNEL BLIS_SSWAPV_KERNEL -#define bli_ddSWAPV_KERNEL BLIS_DSWAPV_KERNEL -#define bli_ccSWAPV_KERNEL BLIS_CSWAPV_KERNEL -#define bli_zzSWAPV_KERNEL BLIS_ZSWAPV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( SWAPV_KERNEL ) - - - -#endif - diff --git a/frame/include/old/bli_kernel_type_defs.h b/frame/include/old/bli_kernel_type_defs.h deleted file mode 100644 index e0190fe1b..000000000 --- a/frame/include/old/bli_kernel_type_defs.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_KERNEL_TYPE_DEFS_H -#define BLIS_KERNEL_TYPE_DEFS_H - - -// -// -- BLIS kernel types -------------------------------------------------------- -// - -// Here we generate typedef statements that generate custom types for -// kernel function pointers. Note that we use the function -// prototype-generating macro since it takes the same arguments we need -// to define our types. - -// -- Level-3 kernels -- - -/* -// gemm - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemm_ukr_t ) - - -// trsm_l/u - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( trsm_ukr_t ) - - -// gemmtrsm_l/u - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a1x, \ - ctype* restrict a11, \ - ctype* restrict bx1, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemmtrsm_ukr_t ) - -// -- packm kernels -- - -// 
packm_struc_cxk - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p \ - ); - -INSERT_GENTPROT_BASIC( packm_ker_t ) -*/ - - - -#endif - diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c index 760e869b8..7a471995d 100644 --- a/frame/util/bli_util_check.c +++ b/frame/util/bli_util_check.c @@ -38,21 +38,6 @@ // Define object-based check functions. // -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* index \ - ) \ -{ \ - bli_utilv_xi_check( x, index ); \ -} - -GENFRONT( amaxv ) - - #undef GENFRONT #define GENFRONT( opname ) \ \ @@ -172,42 +157,6 @@ GENFRONT( sumsqv ) // ----------------------------------------------------------------------------- -void bli_utilv_xi_check - ( - obj_t* x, - obj_t* index - ) -{ - err_t e_val; - - // Check object datatypes. - - e_val = bli_check_floating_object( x ); - bli_check_error_code( e_val ); - - e_val = bli_check_integer_object( index ); - bli_check_error_code( e_val ); - - e_val = bli_check_nonconstant_object( index ); - bli_check_error_code( e_val ); - - // Check object dimensions. - - e_val = bli_check_vector_object( x ); - bli_check_error_code( e_val ); - - e_val = bli_check_scalar_object( index ); - bli_check_error_code( e_val ); - - // Check object buffers (for non-NULLness). 
- - e_val = bli_check_object_buffer( x ); - bli_check_error_code( e_val ); - - e_val = bli_check_object_buffer( index ); - bli_check_error_code( e_val ); -} - void bli_utilv_xa_check ( obj_t* x, diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h index 0fb23bccd..364ab5923 100644 --- a/frame/util/bli_util_check.h +++ b/frame/util/bli_util_check.h @@ -37,18 +37,6 @@ // Prototype object-based check functions. // -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* index \ - ); - -GENPROT( amaxv ) - - #undef GENPROT #define GENPROT( opname ) \ \ diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c index abac92b26..2942616c1 100644 --- a/frame/util/bli_util_oapi.c +++ b/frame/util/bli_util_oapi.c @@ -40,44 +40,6 @@ // Define object-based interfaces. // -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* index \ - BLIS_OAPI_CNTX_PARAM \ - ) \ -{ \ - BLIS_OAPI_CNTX_DECL \ -\ - num_t dt = bli_obj_datatype( *x ); \ -\ - dim_t n = bli_obj_vector_dim( *x ); \ - void* buf_x = bli_obj_buffer_at_off( *x ); \ - inc_t incx = bli_obj_vector_inc( *x ); \ -\ - void* buf_index = bli_obj_buffer_at_off( *index ); \ -\ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, index ); \ -\ - /* Invoke the typed function. */ \ - bli_call_ft_5 \ - ( \ - dt, \ - opname, \ - n, \ - buf_x, incx, \ - buf_index, \ - cntx \ - ); \ -} - -GENFRONT( amaxv ) - - #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h index 9de0afadb..f669271fa 100644 --- a/frame/util/bli_util_oapi.h +++ b/frame/util/bli_util_oapi.h @@ -37,19 +37,6 @@ // Prototype object-based interfaces. 
// -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* index \ - BLIS_OAPI_CNTX_PARAM \ - ); - -GENPROT( amaxv ) - - #undef GENPROT #define GENPROT( opname ) \ \ diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index 8fa89d9ae..ad2bb0b40 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -38,50 +38,6 @@ // Define BLAS-like interfaces with typed operands. // -#undef GENTFUNCI -#define GENTFUNCI( ctype, ctype_i, ch, chi, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* index, \ - cntx_t* cntx \ - ) \ -{ \ - cntx_t* cntx_p = cntx; \ -\ - /* If the vector length is zero, set the index to zero and return - early. This directly emulatess the behavior of netlib LAPACK's - i?amax() routines. */ \ - if ( bli_zero_dim1( n ) ) \ - { \ - ctype_i* zero_i = PASTEMAC(chi,0); \ -\ - PASTEMAC(chi,copys)( *zero_i, *index ); \ - return; \ - } \ -\ - /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ -\ - /* Invoke the helper variant, which loops over the appropriate kernel - to implement the current operation. */ \ - PASTEMAC2(ch,opname,_unb_var1) \ - ( \ - n, \ - x, incx, \ - index, \ - cntx_p \ - ); \ -\ - /* Finalize the context if it was initialized locally. */ \ - /*bli_cntx_finalize_local_if( opname, cntx );*/ \ -} - -INSERT_GENTFUNCI_BASIC0( amaxv ) - - #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h index e7dbc73e7..1f3d48a7c 100644 --- a/frame/util/bli_util_tapi.h +++ b/frame/util/bli_util_tapi.h @@ -37,20 +37,6 @@ // Prototype BLAS-like interfaces with typed operands. 
// -#undef GENTPROTI -#define GENTPROTI( ctype, ctype_i, ch, chi, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* index, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTI_BASIC( amaxv ) - - #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 1ed142a7c..73f17ba29 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -38,71 +38,6 @@ // Define BLAS-like interfaces with typed operands. // -#undef GENTFUNCRI -#define GENTFUNCRI( ctype, ctype_r, ctype_i, ch, chr, chi, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* abmax_i, \ - cntx_t* cntx \ - ) \ -{ \ - ctype_r* minus_one = PASTEMAC(chr,m1); \ - ctype_i* zero_i = PASTEMAC(chi,0); \ -\ - ctype* chi1; \ - ctype_r chi1_r; \ - ctype_r chi1_i; \ - ctype_r abs_chi1; \ - ctype_r abs_chi1_max; \ - ctype_i i_max; \ - dim_t i; \ -\ - /* Initialize the index of the maximum absolute value to zero. */ \ - PASTEMAC(chi,copys)( *zero_i, i_max ); \ -\ - /* Initialize the maximum absolute value search candidate with - -1, which is guaranteed to be less than all values we will - compute. */ \ - PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ -\ - for ( i = 0; i < n; ++i ) \ - { \ - chi1 = x + (i )*incx; \ -\ - /* Get the real and imaginary components of chi1. */ \ - PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ -\ - /* Replace chi1_r and chi1_i with their absolute values. */ \ - PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ - PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ -\ - /* Add the real and imaginary absolute values together. */ \ - PASTEMAC(chr,set0s)( abs_chi1 ); \ - PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ - PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ -\ - /* If the absolute value of the current element exceeds that of - the previous largest, save it and its index. 
If NaN is - encountered, then treat it the same as if it were a valid - value that was smaller than any previously seen. This - behavior mimics that of LAPACK's ?lange(). */ \ - if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ - { \ - PASTEMAC(chr,copys)( abs_chi1, abs_chi1_max ); \ - PASTEMAC(chi,copys)( i, i_max ); \ - } \ - } \ -\ - /* Store final index to output variable. */ \ - PASTEMAC(chi,copys)( i_max, *abmax_i ); \ -} - -INSERT_GENTFUNCRI_BASIC0( amaxv_unb_var1 ) - - #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ \ diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h index 369f5f650..09ca31d76 100644 --- a/frame/util/bli_util_unb_var1.h +++ b/frame/util/bli_util_unb_var1.h @@ -37,20 +37,6 @@ // Prototype BLAS-like interfaces with typed operands. // -#undef GENTPROTRI -#define GENTPROTRI( ctype, ctype_r, ctype_i, ch, chr, chi, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* abmax_i, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTRI_BASIC( amaxv_unb_var1 ) - - #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ diff --git a/testsuite/input.operations b/testsuite/input.operations index 058721632..ac9298f8b 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -107,6 +107,10 @@ -1 # dimensions: m ? # parameters: conjx +1 # amaxv +1 # test sequential front-end +-1 # dimensions: m + 1 # axpbyv 1 # test sequential front-end -1 # dimensions: m diff --git a/testsuite/src/test_amaxv.c b/testsuite/src/test_amaxv.c new file mode 100644 index 000000000..9323ecbba --- /dev/null +++ b/testsuite/src/test_amaxv.c @@ -0,0 +1,400 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "amaxv"; +static char* o_types = "v"; // x +static char* p_types = ""; // (no parameters) +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. 
+void libblis_test_amaxv_deps + ( + test_params_t* params, + test_op_t* op + ); + +void libblis_test_amaxv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + num_t datatype, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_amaxv_impl + ( + iface_t iface, + obj_t* x, + obj_t* index + ); + +void libblis_test_amaxv_check + ( + test_params_t* params, + obj_t* x, + obj_t* index, + double* resid + ); + +void bli_amaxv_test + ( + obj_t* x, + obj_t* index + ); + + + +void libblis_test_amaxv_deps + ( + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( params, &(op->ops->randv) ); +} + + + +void libblis_test_amaxv + ( + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( op->test_done == TRUE ) return; + + // Return early if operation is disabled. + if ( op->op_switch == DISABLE_ALL || + op->ops->l1v_over == DISABLE_ALL ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_amaxv_deps( params, op ); + + // Execute the test driver for each implementation requested. + if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_amaxv_experiment ); + } +} + + + +void libblis_test_amaxv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + num_t datatype, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + dim_t m; + + obj_t x; + obj_t index; + + + // Map the dimension specifier to an actual dimension. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + + // Map parameter characters to BLIS constants. + + + // Create test scalars. 
+ bli_obj_scalar_init_detached( BLIS_INT, &index ); + + // Create test operands (vectors and/or matrices). + libblis_test_vobj_create( params, datatype, sc_str[0], m, &x ); + + // Randomize x. + libblis_test_vobj_randomize( params, FALSE, &x ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + time = bli_clock(); + + libblis_test_amaxv_impl( iface, &x, &index ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( x ) ) *perf *= 2.0; + + // Perform checks. + libblis_test_amaxv_check( params, &x, &index, resid ); + + // Zero out performance and residual if input vector is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + + // Free the test objects. + bli_obj_free( &x ); +} + + + +void libblis_test_amaxv_impl + ( + iface_t iface, + obj_t* x, + obj_t* index + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_amaxv( x, index ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_amaxv_check + ( + test_params_t* params, + obj_t* x, + obj_t* index, + double* resid + ) +{ + obj_t index_test; + obj_t chi_i; + obj_t chi_i_test; + dim_t i; + dim_t i_test; + + double i_d, junk; + double i_d_test; + + // + // Pre-conditions: + // - x is randomized. + // + // Under these conditions, we assume that the implementation for + // + // index := amaxv( x ) + // + // is functioning correctly if + // + // x[ index ] = max( x ) + // + // where max() is implemented via the bli_?amaxv_test() function. + // + + // The following two calls have already been made by the caller. That + // is, the index object has already been created and the library's + // amaxv implementation has already been tested. 
+ //bli_obj_scalar_init_detached( BLIS_INT, &index ); + //bli_amaxv( x, &index ); + bli_getsc( index, &i_d, &junk ); i = i_d; + bli_acquire_vi( i, x, &chi_i ); + + bli_obj_scalar_init_detached( BLIS_INT, &index_test ); + bli_amaxv_test( x, &index_test ); + bli_getsc( &index_test, &i_d_test, &junk ); i_test = i_d_test; + bli_acquire_vi( i_test, x, &chi_i_test ); + + // Verify that the values referenced by index and index_test are equal. + if ( bli_obj_equals( &chi_i, &chi_i_test ) ) *resid = 0.0; + else *resid = 1.0; +} + +// ----------------------------------------------------------------------------- + +// +// Prototype BLAS-like interfaces with typed operands for a local amaxv test +// operation +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx, \ + dim_t* restrict index, \ + cntx_t* cntx \ + ); \ + +INSERT_GENTPROT_BASIC( amaxv_test ) + +// +// Define object-based interface for a local amaxv test operation. +// + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* x, \ + obj_t* index \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *x ); \ +\ + dim_t n = bli_obj_vector_dim( *x ); \ + void* buf_x = bli_obj_buffer_at_off( *x ); \ + inc_t incx = bli_obj_vector_inc( *x ); \ +\ + void* buf_index = bli_obj_buffer_at_off( *index ); \ +\ + if ( bli_error_checking_is_enabled() ) \ + bli_amaxv_check( x, index ); \ +\ + /* Invoke the bli_?amaxv_test() function. */ \ + bli_call_ft_5 \ + ( \ + dt, \ + amaxv_test, \ + n, \ + buf_x, incx, \ + buf_index, \ + NULL \ + ); \ +} + +GENFRONT( amaxv_test ) + +// +// Define BLAS-like interfaces with typed operands for a local amaxv test +// operation. +// NOTE: This is based on a simplified version of the bli_?amaxv_ref() +// reference kernel. 
+// + +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t n, \ + ctype* x, inc_t incx, \ + dim_t* i_max, \ + cntx_t* cntx \ + ) \ +{ \ + ctype_r* minus_one = PASTEMAC(chr,m1); \ + dim_t* zero_i = PASTEMAC(i,0); \ +\ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r abs_chi1; \ + ctype_r abs_chi1_max; \ + dim_t i; \ +\ + /* Initialize the index of the maximum absolute value to zero. */ \ + PASTEMAC(i,copys)( zero_i, *i_max ); \ +\ + /* If the vector length is zero, return early. This directly emulates + the behavior of netlib BLAS's i?amax() routines. */ \ + if ( bli_zero_dim1( n ) ) return; \ +\ + /* Initialize the maximum absolute value search candidate with + -1, which is guaranteed to be less than all values we will + compute. */ \ + PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ +\ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + ctype* chi1 = x + (i )*incx; \ +\ + /* Get the real and imaginary components of chi1. */ \ + PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ +\ + /* Replace chi1_r and chi1_i with their absolute values. */ \ + PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ + PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ +\ + /* Add the real and imaginary absolute values together. */ \ + PASTEMAC(chr,set0s)( abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ +\ + /* If the absolute value of the current element exceeds that of + the previous largest, save it and its index. If NaN is + encountered, then treat it the same as if it were a valid + value that was smaller than any previously seen. This + behavior mimics that of LAPACK's ?lange(). 
*/ \ + if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ + { \ + abs_chi1_max = abs_chi1; \ + *i_max = i; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCR_BASIC0( amaxv_test ) + diff --git a/testsuite/src/test_amaxv.h b/testsuite/src/test_amaxv.h new file mode 100644 index 000000000..364b27963 --- /dev/null +++ b/testsuite/src/test_amaxv.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +void libblis_test_amaxv + ( + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index e22bb52df..bd14d13b4 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -113,6 +113,7 @@ void libblis_test_utility_ops( test_params_t* params, test_ops_t* ops ) void libblis_test_level1v_ops( test_params_t* params, test_ops_t* ops ) { libblis_test_addv( params, &(ops->addv) ); + libblis_test_amaxv( params, &(ops->amaxv) ); libblis_test_axpbyv( params, &(ops->axpbyv) ); libblis_test_axpyv( params, &(ops->axpyv) ); libblis_test_copyv( params, &(ops->copyv) ); @@ -222,6 +223,7 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops ) // Level-1v libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->addv) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 0, &(ops->amaxv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->axpbyv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->axpyv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->copyv) ); @@ -1946,8 +1948,8 @@ void libblis_test_vobj_randomize( test_params_t* params, bool_t normalize, obj_t bli_obj_scalar_init_detached( dt, &kappa ); bli_obj_scalar_init_detached( dt_r, &kappa_r ); - // Normalize vector elements. - //bli_setsc( 1.0/( double )bli_obj_vector_dim( *x ), 0.0, &kappa ); + // Normalize vector elements. The following code ensures that we + // always invert-scale by whole power of two. 
bli_normfv( x, &kappa_r ); libblis_test_ceil_pow2( &kappa_r ); bli_copysc( &kappa_r, &kappa ); diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 07ffcd106..6ecc72d56 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -215,6 +215,7 @@ typedef struct test_ops_s // level-1v test_op_t addv; + test_op_t amaxv; test_op_t axpbyv; test_op_t axpyv; test_op_t copyv; @@ -435,6 +436,7 @@ void libblis_test_check_empty_problem( obj_t* c, double* perf, double* resid ); // Level-1v #include "test_addv.h" +#include "test_amaxv.h" #include "test_axpbyv.h" #include "test_axpyv.h" #include "test_copyv.h" From 866b2dde3f41760121115fb25f096d4344e8b4f9 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 5 Oct 2016 14:41:34 -0500 Subject: [PATCH 16/27] Version file update (0.2.1) --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index 2bfe0beaa..0c62199f1 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.0-37 +0.2.1 From 4fb9b4ef2e4cf2626a6e000a41628fb823f16da8 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 5 Oct 2016 14:41:35 -0500 Subject: [PATCH 17/27] CHANGELOG update (0.2.1) --- CHANGELOG | 1064 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 1057 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 539067456..a361ceac3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,1054 @@ -commit 898614a555ea0aa7de4ca07bb3cb8f5708b6a002 (HEAD -> master, tag: 0.2.0) +commit 866b2dde3f41760121115fb25f096d4344e8b4f9 (HEAD -> master, tag: 0.2.1) +Author: Field G. Van Zee +Date: Wed Oct 5 14:41:34 2016 -0500 + + Version file update (0.2.1) + +commit 87fddeab3c8a5ccb1bbf02e5f89db1464e459ba9 (origin/master) +Merge: 8696987 6f71cd3 +Author: Field G. Van Zee +Date: Wed Oct 5 13:35:01 2016 -0500 + + Merge branch 'compose' + +commit 6f71cd344951854e4cff9ea21bbdfe536e72611d (origin/compose) +Merge: c0630c4 8d55033 +Author: Field G. 
Van Zee +Date: Tue Oct 4 15:53:46 2016 -0500 + + Merge pull request #94 from flame/distcomm + + Implemented distributed thrinfo_t management. + +commit 86969873b5b861966d717d8f9f370af39e3d9de6 +Author: Field G. Van Zee +Date: Tue Oct 4 14:24:59 2016 -0500 + + Reclassified amaxv operation as a level-1v kernel. + + Details: + - Moved amaxv from being a utility operation to being a level-1v operation. + This includes the establishment of a new amaxv kernel to live beside all + of the other level-1v kernels. + - Added two new functions to bli_part.c: + bli_acquire_mij() + bli_acquire_vi() + The first acquires a scalar object for the (i,j) element of a matrix, + and the second acquires a scalar object for the ith element of a vector. + - Added integer support to bli_getsc level-0 operation. This involved + adding integer support to the bli_*gets level-0 scalar macros. + - Added a new test module to test amaxv as a level-1v operation. The test + module works by comparing the value identified by bli_amaxv() to the + value found from a reference-like code local to the test module + source file. In other words, it (intentionally) does not guarantee the + same index is found; only the same value. This allows for different + implementations in the case where a vector contains two or more elements + containing exactly the same floating point value (or values, in the case + of the complex domain). + - Removed the directory frame/include/old/. + +commit 8d55033c966feed99fcca2a58017c3ab5b1646dc (origin/distcomm) +Author: Field G. Van Zee +Date: Tue Sep 27 15:20:58 2016 -0500 + + Implemented distributed thrinfo_t management. + + Details: + - Implemented Ricardo Magana's distributed thread info/communicator + management.
Rather than fully construct the thrinfo_t structures, from + root to leaf, prior to spawning threads, the threads individually + construct their thrinfo_t trees (or, chains), and do so incrementally, + as needed, reusing the same structure nodes during subsequent blocked + variant iterations. This required moving the initial creation of the + thrinfo_t structure (now, the root nodes) from the _front() functions + to the bli_l3_thread_decorator(). The incremental "growing" of the tree + is performed in the internal back-end (ie: _int()) function, and so + mostly invisible. Also, the incremental growth of the thrinfo_t tree is + done as a function of the current and parent control tree nodes (as well + as the parent thrinfo_t node), further reinforcing the parallel + relationship between the two data structures. + - Removed the "inner" communicator from thrinfo_t structure definition, + as well as its id. Changed all APIs accordingly. Renamed + bli_thrinfo_needs_free_comms() to bli_thrinfo_needs_free_comm(). + - Defined bli_l3_thrinfo_print_paths(), which prints the information + in an array of thrinfo_t* structure pointers. (Used only as a + debugging/verification tool.) + - Deprecated the following thrinfo_t creation functions: + bli_packm_thrinfo_create() + bli_l3_thrinfo_create() + because they are no longer used. bli_thrinfo_create() is now called + directly when creating thrinfo_t nodes. + +commit fd04869ae4d4a3b0ebb9052557c296456bce7c0d +Author: Field G. Van Zee +Date: Tue Sep 27 14:14:11 2016 -0500 + + Changed configure's 'omp' threading to 'openmp'. + + Details: + - Changed the configure script so that the expected string argument to the + -t (or --enable-threading=) option that enables OpenMP multithreading is + 'openmp'. The previous expected string, 'omp', is still supported but + should be considered deprecated. + +commit 9424af87209e4e435e2e742430945152690170b0 +Merge: efa7341 c0630c4 +Author: Field G.
Van Zee +Date: Tue Sep 27 12:51:08 2016 -0500 + + Merge branch 'compose' + +commit efa7341df0b0115926aa8a6e8a4ebfb24fdbf11e +Merge: 121c39d e1453f6 +Author: Field G. Van Zee +Date: Fri Sep 16 11:01:57 2016 -0500 + + Merge pull request #92 from ShadenSmith/readme_fix + + Fixes broken URL in README.md + +commit e1453f68f6afd90ae9a29b7a5faa46aa79bbf741 +Author: Shaden Smith +Date: Fri Sep 16 09:29:28 2016 -0500 + + Fixes broken URL in README.md + +commit c0630c4024b08750043a2942a3e8a037aa6b6259 (compose) +Author: Field G. Van Zee +Date: Mon Sep 12 13:59:02 2016 -0500 + + Added debugging printf()'s to bli_l3_thrinfo.c. + + Details: + - Added optional printf() statements to print out thread communicator + info as the thrinfo_t structure is built in bli_l3_thrinfo.c. + - Minor changes to frame/thread/bli_thrinfo.h. + +commit 7b3bf1ffcd7160ccbf6c2518af6d88f6742e4977 +Merge: 3550981 121c39d +Author: Field G. Van Zee +Date: Tue Sep 6 15:47:13 2016 -0500 + + Merge branch 'master' into compose + +commit 121c39d455f2db6f7ce6802ba7f73ad5e088c68c +Author: Field G. Van Zee +Date: Mon Sep 5 13:11:42 2016 -0500 + + Added complex gemm micro-kernels for haswell. + + Details: + - Defined cgemm (3x8) and zgemm (3x4) micro-kernels for haswell-based + architectures. As with their real domain brethren, these kernels prefer + row storage, (though this doesn't affect most users due to high-level + optimizations in most level-3 operations that induce a transpose to + whatever storage preference the kernel may have). + +commit 35509818cbea1598b123421f81c42120889a03c3 +Author: Field G. Van Zee +Date: Wed Aug 31 17:34:15 2016 -0500 + + Added, moved some thread barriers. + + Details: + - Removed thread barriers from the end of the loop bodies of + bli_gemm_blk_var1(), bli_gemm_blk_var2(), bli_trsm_blk_var1(), + and bli_trsm_blk_var2(). + - Moved the thread barrier at the end of bli_packm_int() to the + end of bli_l3_packm(), and added missing barriers to that function.
+ - Removed the no longer necessary (and now incorrect) ochief guard + in bli_gemm3m3_packa() on the bli_obj_scalar_reset() on C. + - Thanks to Tyler Smith for help with these changes. + +commit abd61f9fa75d77a96d1491b3e035451ee73238fe +Author: Field G. Van Zee +Date: Tue Aug 30 12:34:19 2016 -0500 + + Updated BLIS4 TOMS citation in README.md. + +commit 701b9aa3ff028decbf90efac0dca5bd64fe26269 +Author: Field G. Van Zee +Date: Fri Aug 26 19:04:45 2016 -0500 + + Redesigned control tree infrastructure. + + Details: + - Altered control tree node struct definitions so that all nodes have the + same struct definition, whose primary fields consist of a blocksize id, + a variant function pointer, a pointer to an optional parameter struct, + and a pointer to a (single) sub-node. This unified control tree type is + now named cntl_t. + - Changed the way control tree nodes are connected, and what computation + they represent, such that, for example, packing operations are now + associated with nodes that are "inline" in the tree, rather than off- + shoot branches. The original tree for the classic Goto gemm algorithm was + expressed (roughly) as: + + blk_var2 -> blk_var3 -> blk_var1 -> ker_var2 + | | + -> packb -> packa + + and now, the same tree would look like: + + blk_var2 -> blk_var3 -> packb -> blk_var1 -> packa -> ker_var2 + + Specifically, the packb and packa nodes perform their respective packing + operations and then recurse (without any loop) to a subproblem. This means + there are now two kinds of level-3 control tree nodes: partitioning and + non-partitioning. The blocked variants are members of the former, because + they iteratively partition off submatrices and perform suboperations on + those partitions, while the packing variants belong to the latter group. + (This change has the effect of allowing greatly simplified initialization + of the nodes, which previously involved setting many unused node fields to + NULL.)
+ - Changed the way thrinfo_t tree nodes are arranged to mirror the new + connective structure of control trees. That is, packm nodes are no longer + off-shoot branches of the main algorithmic nodes, but rather connected + "inline". + - Simplified control tree creation functions. Partitioning nodes are created + concisely with just a few fields needing initialization. By contrast, the + packing nodes require additional parameters, which are stored in a + packm-specific struct that is tracked via the optional parameters pointer + within the control tree struct. (This parameter struct must always begin + with a uint64_t that contains the byte size of the struct. This allows + us to use a generic function to recursively copy control trees.) gemm, + herk, and trmm control tree creation continues to be consolidated into + a single function, with the operation family being used to select + among the parameter-agnostic macro-kernel wrappers. A single routine, + bli_cntl_free(), is provided to free control trees recursively, whereby + the chief thread within a group releases the blocks associated with + mem_t entries back to the memory broker from which they were acquired. + - Updated internal back-ends, e.g. bli_gemm_int(), to query and call the + function pointer stored in the current control tree node (rather than + index into a local function pointer array). Before being invoked, these + function pointers are first cast to a gemm_voft (for gemm, herk, or trmm + families) or trsm_voft (for trsm family) type, which is defined in + frame/3/bli_l3_var_oft.h. + - Retired herk and trmm internal back-ends, since all execution now flows + through gemm or trsm blocked variants. + - Merged forwards- and backwards-moving variants by querying the direction + from routines as a function of the variant's matrix operands. gemm and + herk always move forward, while trmm and trsm move in a direction that + is dependent on which operand (a or b) is triangular.
+ - Added functions bli_thread_get_range_mdim(), bli_thread_get_range_ndim(), + each of which takes additional arguments and hides complexity in managing + the difference between the way ranges are computed for the four families + of operations. + - Simplified level-3 blocked variants according to the above changes, so that + the only steps taken are: + 1. Query partitioning direction (forwards or backwards). + 2. Prune unreferenced regions, if they exist. + 3. Determine the thread partitioning sub-ranges. + + 4. Determine the partitioning blocksize (passing in the partitioning + direction) + 5. Acquire the current iteration's partitions for the matrices affected + by the current variant's partitioning dimension (m, k, n). + 6. Call the subproblem. + + - Instantiate control trees once per thread, per operation invocation. + (This is a change from the previous regime in which control trees were + treated as stateless objects, initialized with the library, and shared + as read-only objects between threads.) This once-per-thread allocation + is done primarily to allow threads to use the control tree as a place + to cache certain data for use in subsequent loop iterations. Presently, + the only application of this caching is a mem_t entry for the packing + blocks checked out from the memory broker (allocator). If a non-NULL + control tree is passed in by the (expert) user, then the tree is copied + by each thread. This is done in bli_l3_thread_decorator(), in + bli_thrcomm_*.c. + - Added a new field to the context, an opid_t which tracks the "family" + of the operation being executed. For example, gemm, hemm, and symm are + all part of the gemm family, while herk, syrk, her2k, and syr2k are + all part of the herk family.
Knowing the operation's family is necessary + when conditionally executing the internal (beta) scalar reset on + C in blocked variant 3, which is needed for gemm and herk families, + but must not be performed for the trmm family (because beta has only + been applied to the current row-panel of C after the first rank-kc + iteration). + - Reexpressed 3m3 induced method blocked variant in frame/3/gemm/ind + to conform with the new control tree design, and renamed the macro- + kernel codes corresponding to 3m2 and 4m1b. + - Renamed bli_mem.c (and its APIs) to bli_memsys.c, and renamed/relocated + bli_mem_macro_defs.h from frame/include to frame/base/bli_mem.h. + - Renamed/relocated bli_auxinfo_macro_defs.h from frame/include to + frame/base/bli_auxinfo.h. + - Fixed a minor bug whereby the storage-to-ukr-preference matching + optimization in the various level-3 front-ends was not being applied + properly when the context indicated that execution would be via an + induced method. (Before, we always checked the native micro-kernel + corresponding to the datatype being executed, whereas now we check + the native micro-kernel corresponding to the datatype's real projection, + since that is the micro-kernel that is actually used by induced methods.) + - Added an option to the testsuite to skip the testing of native level-3 + complex implementations. Previously, it was always tested, provided that + the c/z datatypes were enabled. However, some configurations use + reference micro-kernels for complex datatypes, and testing these + implementations can slow down the testsuite considerably. + +commit 73517f522b69de429dd7f3df60a70c068149ab28 +Merge: c6f5c21 50293da +Author: Field G. Van Zee +Date: Tue Aug 23 13:46:59 2016 -0500 + + Merge branch 'master' into compose + +commit 50293da38d5f2b7be9bbc94b9e85aacb6a10f672 +Author: Field G. Van Zee +Date: Tue Aug 23 13:38:36 2016 -0500 + + Avoid compiling BLAS/CBLAS files when disabled.
+ + Details: + - Updated the top-level Makefile, build/config.mk.in template, and + configure script so that object files corresponding to source files + belonging to the BLAS compatibility layer are not compiled (or archived) + when the compatibility layer is disabled. (Same for CBLAS.) Thanks + to Devin Matthews for suggesting this optimization. + - Slight change to the way configure handles internal variables. Instead + of converting (overwriting) some, such as enable_blas2blis and + enable_cblas, from a "yes" or "no" to a "1" or "0" value, the latter are + now stored in new variables that live alongside the originals (with the + suffix "_01"). This is convenient since some values need to be + sed-substituted into the config.mk.in template, which requires "yes" or + "no", while some need to be written to the bli_config.h.in template, + which requires "0" or "1". + +commit c6f5c215ee793d03ea834469fc2adc53feaffc42 +Merge: d52cb76 16a4c7a +Author: Field G. Van Zee +Date: Mon Aug 22 17:33:02 2016 -0500 + + Merge branch 'master' into compose + +commit 16a4c7a823d60707ed9272f5d36e5c5d54c0ba4b +Author: Field G. Van Zee +Date: Fri Aug 19 11:38:36 2016 -0500 + + Fixed bugs in bli_mutex_init() and friends. + + Details: + - Fixed a couple of bugs that affected OpenMP and POSIX threads + configurations that resulted in compiler errors and warnings due + to type mismatch, and in the case of pthreads, a missing function + argument. The bugs are fairly recent, introduced in a017062. + +commit d52cb7671509592a8078729477b40b60380518a2 +Merge: 95abea4 c31b1e7 +Author: Field G. Van Zee +Date: Wed Jul 27 16:04:55 2016 -0500 + + Merge branch 'master' into compose + +commit c31b1e7b9d659b96433a87e5aecb90e457a104cc +Author: Field G. Van Zee +Date: Wed Jul 27 15:58:07 2016 -0500 + + Relax alignment restrictions for sandybridge ukrs. 
+ + Details: + - Relaxed the base pointer and leading dimension alignment restrictions + in the sandybridge gemm microkernels, allowing the use of vmovups/vmovupd + instead of vmovaps/vmovapd. These changes mimic those made to the haswell + microkernels in e0d2fa0 and ee2c139. + - Updated testsuite modules as well as standalone test drivers in 'test' + directory to use DBL_MAX as the initial time candidate. Thanks to Devin + Matthews for suggesting this change. + - Inserted #include "float.h" into bli_system.h (to gain access to DBL_MAX). + - Minor update (vis-a-vis contexts) to driver code in test/3m4m. + +commit 95abea46f86816fddfc9ff0abfa52880801461be +Merge: d0dfe5b a017062 +Author: Field G. Van Zee +Date: Sat Jul 23 15:38:33 2016 -0500 + + Merge branch 'master' into compose + +commit a017062fdf763037da9d971a028bb07d47aa1c8a +Author: Field G. Van Zee +Date: Fri Jul 22 17:02:59 2016 -0500 + + Integrated "memory broker" (membrk_t) abstraction. + + Details: + - Integrated a patch originally authored and submitted by Ricardo Magana + of HP Enterprise. The changeset inserts use of a new object type, membrk_t, + (memory broker) that allows multiple sets of memory pools on, for example, + separate NUMA nodes, each of which has a separate memory space. + - Added membrk field to cntx_t and defined corresponding accessor macros. + - Added membrk field to mem_t object and defined corresponding accessor macros.
+ - Created new bli_membrk.c file, which contains the new memory broker API, + including: + bli_membrk_init(), bli_membrk_finalize() + bli_membrk_acquire_[mv](), bli_membrk_release(), + bli_membrk_init_pools(), bli_membrk_reinit_pools(), + bli_membrk_finalize_pools(), + bli_membrk_pool_size() + - In bli_mem.c, changed function calls to + bli_mem_init_pools() -> bli_membrk_init() + bli_mem_reinit_pools() -> bli_membrk_reinit() + bli_mem_finalize_pools() -> bli_membrk_finalize() + - In bli_packv_init.c, bli_packm_init.c, changed function calls to: + bli_mem_acquire_[mv]() -> bli_membrk_acquire_[mv]() + bli_mem_release() -> bli_membrk_release() + - Added bli_mutex.c and related files to frame/thread. These files define + abstract mutexes (locks) and corresponding APIs for pthreads, openmp, or + single-threaded execution. This new API is employed within functions + such as bli_membrk_acquire_[mv]() and bli_membrk_release(). + +commit ce59f81108ec9aea918a7e77030da8acfdd397ce +Merge: ff41153 707a2b7 +Author: Field G. Van Zee +Date: Fri Jul 22 14:48:14 2016 -0500 + + Merge pull request #88 from devinamatthews/32bit-dim_t + + Handle 32-bit dim_t in 64-bit microkernels. + +commit 707a2b7faca137cca7cab7b11a12c44ddaf7ad53 +Author: Devin Matthews +Date: Fri Jul 22 13:49:44 2016 -0500 + + Somehow forgot the most important microkernel. + +commit 47ec045056351ac4f0791c071fa0daaa81699c8c +Merge: 08f1d6b ff41153 +Author: Devin Matthews +Date: Fri Jul 22 13:45:23 2016 -0500 + + Merge remote-tracking branch 'upstream/master' into 32bit-dim_t + +commit 08f1d6b6fa344275de0f675f69737145ccf6646a +Author: Devin Matthews +Date: Fri Jul 22 13:44:37 2016 -0500 + + Use 64-bit intermediate variable for k for architectures that do 64-bit loads in case dim_t is 32-bit. + +commit ff41153f4eb7f38ed94bdd9a3fd81fb979f3f401 +Merge: f9214ce e0d2fa0 +Author: Field G. 
Van Zee +Date: Fri Jul 22 13:21:03 2016 -0500 + + Merge pull request #86 from devinamatthews/haswell-vmovups + + Remove alignment restrictions on C in haswell kernel. + +commit e0d2fa0d835ab49366aeb790363bb2b571d36ed8 +Author: Devin Matthews +Date: Fri Jul 22 12:56:51 2016 -0500 + + Relax alignment restrictions for haswell sgemm. + +commit f9214ced97392861f5a0ea72abfcf6f41faf674c +Merge: 413d62a 08666ea +Author: Field G. Van Zee +Date: Fri Jul 22 12:16:39 2016 -0500 + + Merge pull request #85 from devinamatthews/qopenmp + + Change -openmp to -fopenmp for icc. + +commit ee2c139df6ad53c6aec8a67ab23b3b1912e8d259 +Author: Devin Matthews +Date: Fri Jul 22 12:06:03 2016 -0500 + + Remove alignment restrictions on C in haswell kernel. + +commit 08666eaa20d8a31f2f92f944e5bfa7c1558c53e4 +Author: Devin Matthews +Date: Fri Jul 22 11:07:34 2016 -0500 + + Change -openmp to -fopenmp for icc. + +commit d0dfe5b5372cc7558ee9c4104b29f82eecc7ed61 +Merge: 31def12 413d62a +Author: Field G. Van Zee +Date: Thu Jul 14 11:01:06 2016 -0500 + + Merge branch 'master' into compose + +commit 413d62aca28edabba56605a9f87d5b715831e1db +Author: Field G. Van Zee +Date: Tue Jul 12 15:02:52 2016 -0500 + + README update (use official ACM TOMS links). + +commit dfa431f696db2df4065ea454df268a2e0bc02eac +Author: Field G. Van Zee +Date: Tue Jul 12 14:21:19 2016 -0500 + + README update (BLIS2 TOMS article now in-print). + +commit 31def12e2629f187e40f93f6bae9e26a6c2660e2 +Author: Field G. Van Zee +Date: Thu Jun 30 15:19:20 2016 -0500 + + First phase of control tree redesign. + + Details: + - These changes constitute the first set of changes in preparation to + revamping the structure and use of control trees in BLIS. Modifications + in this commit don't affect the control tree code yet, but rather lay + the groundwork. 
+ - Defined wrappers for the following functions, where the wrappers + each take a direction parameter of a new enumerated type (BLIS_BWD or + BLIS_FWD), dir_t, and execute the correct underlying function. + - bli_acquire_mpart_*() and _vpart_*() + - bli_*_determine_kc_[fb]() + - bli_thread_get_range_*() and bli_thread_get_range_weighted_*() + - Consolidated all 'f' (forwards-moving) and 'b' (backwards-moving) + blocked variants for trmm and trsm, and renamed gemm and herk variants + accordingly. The direction is now queried via routines such as + bli_trmm_direct(), which determines the direction from the implied side + and uplo parameters. For gemm and herk, it is unconditionally BLIS_FWD. + - Defined wrappers to parameter-specific macrokernels for herk, trmm, and + trsm, e.g. bli_trmm_xx_ker_var2(), that execute the correct underlying + macrokernel based on the implied parameters. The same logic used to + choose the dir_t in _direct() functions is used here. + - Simplified the function pointer arrays in _int() functions given the + consolidation and dir_t querying mentioned above. + - Function signature (whitespace) reformatting for various functions. + - Removed old code in various 'old' directories. + +commit 232754feecf29452987666b9f5ebba2619bfd0b0 +Author: Field G. Van Zee +Date: Tue Jun 21 14:25:39 2016 -0500 + + Fixed compiler warning in rand[vm], randn[vm]. + + Details: + - Fixed compiler warnings about unused variables related to the disabling + of normalization in the structured cases of the rand[vm] and randn[vm] + operations. + +commit a89555d1605574f3685813dcc972b636dd61264d +Author: Field G. Van Zee +Date: Fri Jun 17 14:08:35 2016 -0500 + + Added randn[vm] operations, support in testsuite. + + Details: + - Defined a new randomization operation, randn, on vectors and matrices. + The randnv and randnm operations randomize each element of the target + object with values from a narrow range of values.
Presently, those + values are all integer powers of two, but they do not need to be powers + of two in order to achieve the primary goal, which is to initialize + objects that can be operated on with plenty of precision "slack" + available to allow computations that avoid roundoff. Using this method + of randomization makes it much more likely that testsuite residuals of + properly-functioning operations are close to zero, if not exactly zero. + - Updated existing randomization operations randv and randm to skip + special diagonal handling and normalization for matrices with structure. + This is now handled by the testsuite modules by explicitly calling a + testsuite function that loads the diagonal (and scales off-diagonal + elements). + - Added support for randnv and randnm in the testsuite with a new switch + in input.general that universally toggles between use of the classic + randv/randm, which use real values on the interval [-1,1], and + randnv/randnm, which use only values from a narrow range. Currently, + the narrow range is: +/-{2^0, 2^-1, 2^-2, 2^-3, 2^-4, 2^-5, 2^-6}, as + well as 0.0. + - Updated testsuite modules so that a testsuite wrapper function is called + instead of directly calling the randomization operations (such as + bli_randv() and bli_randm()). This wrapper also takes a bool_t that + indicates whether the object's elements should be normalized. (NOTE: As + alluded to above, in the test modules of triangular solve operations such + as trsv and trsm, we perform the extra step of loading the diagonal.) + - Defined a new level-0 operation, invertsc, which inverts a scalar. + - Updated the abval2ris and sqrt2ris level-0 macros to avoid an unlikely + but possible divide-by-zero. + - Updated function signature and prototype formatting in testsuite. + +commit 096895c5d538a7f8817603d7cf28c52e99340def +Author: Field G. Van Zee +Date: Mon Jun 6 13:32:04 2016 -0500 + + Reorganized code, APIs related to multithreading.
+ + Details: + - Reorganized code and renamed files defining APIs related to multithreading. + All code that is not specific to a particular operation is now located in a + new directory: frame/thread. Code is now organized, roughly, by the + namespace to which it belongs (see below). + - Consolidated all operation-specific *_thrinfo_t object types into a single + thrinfo_t object type. Operation-specific level-3 *_thrinfo_t APIs were + also consolidated, leaving bli_l3_thrinfo_*() and bli_packm_thrinfo_*() + functions (aside from a few general purpose bli_thrinfo_*() functions). + - Renamed thread_comm_t object type to thrcomm_t. + - Renamed many of the routines and functions (and macros) for multithreading. + We now have the following API namespaces: + - bli_thrinfo_*(): functions related to thrinfo_t objects + - bli_thrcomm_*(): functions related to thrcomm_t objects. + - bli_thread_*(): general-purpose functions, such as initialization, + finalization, and computing ranges. (For now, some macros, such as + bli_thread_[io]broadcast() and bli_thread_[io]barrier() use the + bli_thread_ namespace prefix, even though bli_thrinfo_ may be more + appropriate.) + - Renamed thread-related macros so that they use a bli_ prefix. + - Renamed control tree-related macros so that they use a bli_ prefix (to be + consistent with the thread-related macros that were also renamed). + - Removed #undef BLIS_SIMD_ALIGN_SIZE from dunnington's bli_kernel.h. This + #undef was a temporary fix to some macro defaults which were being applied + in the wrong order, which was recently fixed. 
+ +commit 232530e88ff99f37abcae5b6fb5319a9a375a45f +Merge: 4bcabd1 eef37f8 +Author: Tyler Michael Smith +Date: Wed Jun 1 15:14:10 2016 -0500 + + Merge commit 'refs/pull/81/head' of https://github.com/flame/blis + + Conflicts: + frame/base/bli_threading_pthreads.c + frame/base/bli_threading_pthreads.h + +commit 4bcabd1bf60688c38cf562459fc5e8be8b831756 +Author: Tyler Michael Smith +Date: Wed Jun 1 13:27:28 2016 -0500 + + Use spin locks instead of pthread barriers + +commit eef37f8b4d81845a6ba4bf25586d32b50c3e8a68 +Author: Jeff Hammond +Date: Sun May 29 22:28:13 2016 -0700 + + use GCC intrinsic instead of pthread_mutex for atomic increment and fetch + +commit 9dcd6f05c4c3ff2ce7cd87a9951a96ebef22681e +Author: Field G. Van Zee +Date: Tue May 24 13:15:32 2016 -0500 + + Implemented developer-configurable malloc()/free(). + + Details: + - Replaced all instances of bli_malloc() and bli_free() with one of: + - bli_malloc_pool()/bli_free_pool() + - bli_malloc_user()/bli_free_user() + - bli_malloc_intl()/bli_free_intl() + each of which can be configured to call malloc()/free() substitutes, + so long as the substitute functions have the same function type + signatures as malloc() and free() defined by C's stdlib.h. The _pool() + function is called when allocating blocks for the memory pools (used + for packing buffers, primarily), the _user() function is called when + obj_t's are created (via bli_obj_create() and friends), and the _intl() + function is called for internal use by BLIS, such as when creating + control tree nodes or temporary buffers for manipulating internal data + structures. Substitutes for any of the three types of bli_malloc() may + be specified by #defining the following pairs of cpp macros in + bli_kernel.h: + - BLIS_MALLOC_POOL/BLIS_FREE_POOL + - BLIS_MALLOC_USER/BLIS_FREE_USER + - BLIS_MALLOC_INTL/BLIS_FREE_INTL + to be the name of the substitute functions. (Obviously, the object + code that contains these functions must be provided at link-time.) 
+ These macros default to malloc() and free(). Substitute functions are + also automatically prototyped by BLIS (in bli_malloc_prototypes.h). + - Removed definitions for bli_malloc() and bli_free(). + - Note that bli_malloc_pool() and bli_malloc_user() are now defined in + terms of a new function, bli_malloc_align(), which aligns memory to an + arbitrary (power of two) alignment boundary, but does so manually, + whereas before alignment was performed behind the scenes by + posix_memalign(). Currently, bli_malloc_intl() is defined in terms + of bli_malloc_noalign(), which serves as a simple wrapper to the + designated function that is passed in (e.g. BLIS_MALLOC_INTL). + Similarly, there are bli_free_align() and bli_free_noalign(), which + are used in concert with their bli_malloc_*() counterparts. + +commit 9dd440109a9d964f5cd286e9f83c487ad703e1e4 +Author: Jeff Hammond +Date: Sat May 21 15:21:58 2016 -0700 + + fix 404 link to BuildSystem + + Google Code is dead. Long live GitHub! + +commit d309f20b7376a68efa3b864ad790c2021c071655 +Author: Field G. Van Zee +Date: Wed May 18 15:13:53 2016 -0500 + + Added alignment switch to testsuite. + + Details: + - Added a new input parameter to input.general that globally toggles + whether testsuite tests are performed on objects whose buffers and + leading dimensions have been aligned, and changed the implementation + of libblis_test_mobj_create() to employ alignment (or not) regardless + of whether row, column, or general storage is being tested. + - Updated configure script's "--help" text to indicate default behavior + for internal integer type size and BLAS/CBLAS integer type size + options. + +commit 32db0adc218ea4ae370164dbe8d23b41cd3526d3 +Author: Field G. Van Zee +Date: Tue May 17 15:20:16 2016 -0500 + + Generate prototypes for user-defined packm kernels.
+ + Details: + - Created template prototypes for packm kernels (in bli_l1m_ker.h), and + then redefined reference packm kernels' prototyping headers in terms of + this template, as is already done for level-1v, -1f, and -3 kernels. + - Automatically generate prototypes for user-defined packm kernels in + bli_kernel_prototypes.h (using the new template prototypes in + bli_l1m_ker.h). + - Defined packm kernel function types in bli_l1m_ft.h, including for + packm kernels specific to induced methods, which are now used in + bli_packm_cxk.c and friends rather than using a locally-defined + function type. + - In bli_packm_cxk.c, extended function pointer for packm kernels array + out to index 31 (from previous maximum of 17). This allows us to + store the unrolled 30xk kernel in the array for use (on knc, for + example). Note: This should have been done a long time ago. + +commit 4bcf1b35abea3f3dfc8f2fe462dcf155cf199e55 +Author: Field G. Van Zee +Date: Wed May 11 16:09:49 2016 -0500 + + Fixed bli_get_range_*() bugs in trsm variants. + + Details: + - Fixed incorrect calls to bli_get_range_*() from within trsm blocked + variants 1f, 2b, and 2f. The bug somehow went undetected since the + big commit (537a1f4), and, strangely, did not manifest via the BLIS + testsuite. The bug finally came to our attention when running the + libflame test suite while linking to BLIS. Thanks to Kiran Varaganti + for submitting the initial report that led to this fix. + +commit 9cfa33023f123a6c17e987f72fba174ce073f0b6 +Author: Field G. Van Zee +Date: Wed May 11 16:02:30 2016 -0500 + + Minor updates to bli_f2c.h. + + Details: + - Added #undef guards to certain #define statements in bli_f2c.h, + and renamed the file guard to BLIS_F2C_H. This helps when + #including "blis.h" from an application or library that already + #includes an "f2c.h" header.
+ +commit a09a2e23eacf5328858c8318bb637c5ff3b71d08 +Merge: 4dcd37e 7c604e1 +Author: Tyler Michael Smith +Date: Wed May 11 10:47:11 2016 -0500 + + Merge pull request #76 from devinamatthews/move_simd_defs + + Move default SIMD-related definitions to bli_kernel_macro_defs.h + +commit 4dcd37eb1b12a6e08cc13df7b61391ef8363f5d8 +Author: Tyler Smith +Date: Tue May 10 16:28:59 2016 -0500 + + fixing knc simd align size + +commit 7c604e1cbc1609b6e12d3ee973c08b7af5035be4 +Author: Devin Matthews +Date: Tue May 10 12:11:55 2016 -0500 + + Move default SIMD-related definitions to bli_kernel_macro_defs.h. Otherwise, configurations which customize these fail as these are now defined in bli_kernel.h. + +commit a7be2d28e8930b154d0da1d6929b54a96e210af6 +Merge: 97b512e 4b1e55e +Author: Field G. Van Zee +Date: Tue May 10 11:48:51 2016 -0500 + + Merge pull request #74 from devinamatthews/fix_common_symbols + + Default-initialize all extern global variables to avoid generating common symbols. + +commit 4b1e55edbfe0e1cb2e7b9428424903497cb7a841 +Author: Devin Matthews +Date: Tue May 10 10:08:47 2016 -0500 + + Default-initialize all extern global variables to avoid generating common symbols. Fixes #73. + +commit 97b512ef62c7e25c97ed5e9eca81cd7015b2ac91 +Author: Field G. Van Zee +Date: Fri May 6 10:24:30 2016 -0500 + + Include headers from cblas.h to pull in f77_int. + + Details: + - Added #include statements for certain key BLIS headers so that the + definition of f77_int is pulled in when a user compiles application + code with only #include "cblas.h" (and no other BLIS header). This + is necessary since f77_int is now used within the cblas API. + +commit c3a4d39d03665135f1616588b5ef7c3e9ef5688d +Author: Field G. Van Zee +Date: Wed May 4 17:22:56 2016 -0500 + + Updates to haswell gemm micro-kernels. + + Details: + - Added two new sets of [sd]gemm micro-kernels for haswell architectures, + one that is 4x24/4x12 (s and d) and one that is 6x16/6x8. 
+ - Changed the haswell configuration to use the 6x16/6x8 micro-kernels + by default. + - Updated various Makefiles, in test, test/3m4m, and testsuite. + +commit 0b01d355ae861754ae2da6c9a545474af010f02e +Author: Field G. Van Zee +Date: Wed Apr 27 15:21:10 2016 -0500 + + Miscellaneous cleanups, fixes to recent commits. + + Details: + - Fixed a typo in bli_l1f_ref.h, introduced into bbb8569, that only + manifested when non-reference level-1f kernels were used. + - Added an #undef BLIS_SIMD_ALIGN_SIZE to bli_kernel.h of dunnington + configuration to prevent a compile-time warning until I can figure out + the proper permanent fix. + - Moved frame/1f/kernels/bli_dotxaxpyf_ref_var1.c out of the compilation + path (into 'other' directory). _ref_var2 is used by default, which is + the variant that is built on axpyf and dotxf instead of dotaxpyv. + - Removed section of frame/include/bli_config_macro_defs.h pertaining to + mixed datatype support. + +commit ed7326c836f427e2f8420b015220ce293207b10c +Author: Field G. Van Zee +Date: Wed Apr 27 14:57:40 2016 -0500 + + Added 'restrict' to l1v/l1f code in 'kernels' dir. + + Details: + - Added 'restrict' keyword to existing kernel definitions in 'kernels' + directory. These changes were meant for inclusion in bbb8569. + +commit bbb8569b2a08c3bcd631d5a05eb389d01d94ac07 +Author: Field G. Van Zee +Date: Wed Apr 27 14:13:46 2016 -0500 + + Use 'restrict' in all kernel APIs; wspace changes. + + Details: + - Updated level-1v, level-1f kernel function types (bli_l1?_ft.h) and + generic kernel prototypes (bli_l1?_ker.h) to use 'restrict' for all + numerical operand pointers (ie: all pointers except the cntx_t). + - Updated level-1f reference kernel definitions to use 'restrict' for + all numerical operand pointers. (Level-1v reference kernel definitions + were already updated in bdbda6e.) 
+ - Rewrote the level-1v and level-1f reference kernel prototypes in + bli_l1v_ref.h and bli_l1f_ref.h, respectively, to simply #include + bli_l1v_ker.h and bli_l1f_ker.h with redefined function base names + (as was already being done for the level-3 micro-kernel prototypes + in bli_l3_ref.h), rather than duplicate the signatures from the + _ker.h files. + - Added definitions to frame/include/bli_kernel_prototypes.h for axpbyv + and xpbyv, which were probably meant for inclusion in bdbda6e. + - Converted a number of instances of four spaces, as introduced in + bdbda6e, to tabs. + +commit 4ea419c72c789825e1f93a1eee88219bbf873930 +Merge: f1e9be2 bdbda6e +Author: Field G. Van Zee +Date: Tue Apr 26 12:50:45 2016 -0500 + + Merge pull request #70 from devinamatthews/daxpby + + Give the level1v operations some love + +commit bdbda6e6acc682ab1b6ca680edebd09ae12a832c +Author: Devin Matthews +Date: Mon Apr 25 11:05:57 2016 -0500 + + Give the level1v operations some love: + + - Add missing axpby and xpby operations (plus test cases). + - Add special case for scal2v with alpha=1. + - Add restrict qualifiers. + - Add special-case algorithms for incx=incy=1. + +commit f1e9be2aba1a057eedb947bbae96848597777408 +Author: Field G. Van Zee +Date: Fri Apr 22 15:34:02 2016 -0500 + + Minor tweak to test/Makefile. + + Details: + - Just committing a minor change to test/Makefile that has been lingering + in my local working copy for longer than I can remember. + +commit aa0bceec277938328dabeb744680623f24fb0b61 +Merge: 4136553 e2784b4 +Author: Field G. Van Zee +Date: Fri Apr 22 12:01:31 2016 -0500 + + Merge branch 'master' of github.com:flame/blis + +commit 4136553f0d0661a668dfdb9edcd7ce1c5773dde7 +Author: Field G. Van Zee +Date: Fri Apr 22 11:53:53 2016 -0500 + + Clear level-3 cntx_t's via memset() before use. 
+ + Details: + - In all level-3 operations' _cntx_init() functions, replaced calls to + bli_cntx_obj_init() with calls to bli_cntx_obj_clear(), and in all + level-3 operations' _cntx_finalize() functions, removed calls to + bli_cntx_obj_finalize(), leaving those function definitions empty. + - Changed the definition of bli_cntx_obj_clear() so that the clearing + occurs via a single call to memset(). + +commit e2784b4c921f706e756df3e146e20a4cb63f53e3 +Merge: dd0ab1d a9b6c3a +Author: Field G. Van Zee +Date: Wed Apr 20 18:34:09 2016 -0500 + + Merge pull request #67 from devinamatthews/cblas-f77-int + + Change CBLAS integer type to f77_int + +commit a9b6c3abda6222a8b240361643932e83cf726c4f +Merge: e4c54c8 dd0ab1d +Author: Devin Matthews +Date: Wed Apr 20 16:00:10 2016 -0500 + + Merge remote-tracking branch 'origin/master' into cblas-f77-int + + # Conflicts: + # config/haswell/bli_config.h + +commit e4c54c81463c2a19c9bb6b1f0f1be3fa9d018a45 +Author: Devin Matthews +Date: Wed Apr 20 15:56:46 2016 -0500 + + Change integer type in CBLAS function signatures to f77_int, and add proper const-correctness to BLAS layer. + +commit dd0ab1d93f33abca6af9edd7b8e52da62dcfa5b1 +Author: Field G. Van Zee +Date: Wed Apr 20 14:38:23 2016 -0500 + + Converted some bli_cntx query functions to macros. + + Details: + - Commented out several datatype-aware query functions (those ending in + _dt) from bli_cntx.c, as well as their prototypes in bli_cntx.h, and + added equivalent cpp query macros to bli_cntx.h. + - Added 'bli_config.h' to .gitignore. + +commit a30ccbc4c6a6e6460e78af6b5c530ee0d06f98fb +Merge: eb2f18e 0e1a982 +Author: Field G. Van Zee +Date: Tue Apr 19 15:04:33 2016 -0500 + + Merge pull request #66 from devinamatthews/blas-configure + + Add configure options and generate bli_config.h automatically. + +commit eb2f18e4844d985715df20798f50f9cc12e3b5ad +Author: Field G. Van Zee +Date: Tue Apr 19 12:50:32 2016 -0500 + + More compile-time fixes to bgq gemm ukernel code. 
+ +commit 0e1a9821d860f6c1d818baf4c48d21a23726c132 +Author: Devin Matthews +Date: Tue Apr 19 11:44:37 2016 -0500 + + Add configure options and generate bli_config.h automatically. + + Options to configure have been added for: + - Setting the internal BLIS and BLAS/CBLAS integer sizes. + - Enabling and disabling the BLAS and CBLAS layers. + + Additionally, configure options which require defining macros (the above plus the threading model), write their macros to the automatically-generated bli_config.h file in the top-level build directory. The old bli_config.h files in the config dirs were removed, and any kernel-related macros (SIMD size and alignment etc.) were moved to bli_kernel.h. The Makefiles were also modified to find the new bli_config.h file. + + Lastly, support for OMP in clang has been added (closes #56). + +commit ff84469a4575f1ef8a0010046fde52240a312cae +Author: Field G. Van Zee +Date: Mon Apr 18 12:29:09 2016 -0500 + + Applied various compilation fixes to bgq kernels. + +commit cbcd0b739dc54bd14fbb46aeda267c26725cd70f +Author: Tyler Michael Smith +Date: Mon Apr 18 03:12:57 2016 -0500 + + Changing ifdef for OSX pthread barriers + +commit dd62080cea78f3a23616200d6640e52c102b2bb9 +Author: Field G. Van Zee +Date: Fri Apr 15 11:15:41 2016 -0500 + + Compile-time fix to bgq l1f kernels. + + Details: + - Fixed an old reference to bli_daxpyf_fusefac, which no longer exists, + by replacing it with the axpyf fusing factor (8), and cleaned up the + relevant section of config/bgq/bli_kernel.h. + - Removed most of the details of the level-3 kernels from the template + kernel code in config/template/kernels/3 and replaced it with a + reference to the relevant kernel wiki maintained on the BLIS github + website. + +commit d5a915dd8d7a6ead42a68772e4420eb3647e6f1a +Merge: 4320b72 4169467 +Author: Field G. Van Zee +Date: Thu Apr 14 12:56:36 2016 -0500 + + Merge branch 'master' of github.com:flame/blis + +commit 4320b725a1f8fd34101470b6cf52ad504a79c517 +Author: Field G. 
Van Zee +Date: Thu Apr 14 12:51:29 2016 -0500 + + Use kernel CFLAGS on "ukernels" directories. + + Details: + - Updated the top-level Makefile so that the CFLAGS variable designated + for kernel source code is applied not only to source code in + directories named "kernels" but source code in any directory that + contains the substring "kernels", such as "ukernels". + - Formally disabled some code in gen-make-frag.sh script that was already + effectively disabled. The code was related to handling "noopt" and + "kernel" directories, which is now handled independently within the + top-level Makefile without needing to place these source files into + a separate makefile variable. + +commit 41694675e4cb56e2e0323c7a7db48e0819606a31 +Author: Tyler Smith +Date: Wed Apr 13 15:51:08 2016 -0500 + + pthreads bugfixes + + Getting pthreads to work on my Mac + Implemented a pthread barrier when _POSIX_BARRIER isn't defined + Now spawn n-1 threads instead of n threads so that master thread isn't just spinning the whole time + Add -lpthread instead of -pthread to LDFLAGS (for clang) + +commit f756dbfa0d542cbc497724981520c83abf049c4b +Author: Field G. Van Zee +Date: Wed Apr 13 11:25:33 2016 -0500 + + Removed stale #include from bgq configuration. + + Details: + - Removed an old #include statement ("bli_gemm_8x8.h") from the + bli_kernel.h file in the bgq configuration. It turns out this + file was no longer needed even prior to 537a1f4. + +commit 0bd4169ea75f690714e7d2912229932a75d8a7e2 +Author: Field G. Van Zee +Date: Mon Apr 11 18:08:32 2016 -0500 + + Fixed context-broken dunnington/penryn kernels. + + Details: + - Added missing context parameters to several instances where simpler + kernels, or reference kernels, are called instead of executing the + main body code contained in the kernel function in question. + - Renamed axpyv and dotv kernel files to use "opt" instead of "int" + substring, for consistency with level-1f kernels.
+ +commit 7912af5db45b7372d19a9a3dfeb82df302a05628 +Author: Field G. Van Zee +Date: Mon Apr 11 17:32:13 2016 -0500 + + CHANGELOG update (0.2.0) + +commit 898614a555ea0aa7de4ca07bb3cb8f5708b6a002 (tag: 0.2.0) Author: Field G. Van Zee Date: Mon Apr 11 17:32:09 2016 -0500 @@ -132,7 +1182,7 @@ Date: Mon Apr 11 17:21:28 2016 -0500 that this does not preclude supporting mixed types via the object APIs, where it produces absolutely zero API code bloat. -commit d1f8e5d9b2ecd054ed103f4d642d748db2d4f173 (origin/master) +commit d1f8e5d9b2ecd054ed103f4d642d748db2d4f173 Merge: 20af937 c11d28e Author: Field G. Van Zee Date: Tue Apr 5 12:21:27 2016 -0500 @@ -2384,8 +3434,8 @@ Date: Wed Aug 20 14:44:51 2014 -0500 Merge branch 'master' of http://github.com/flame/blis Conflicts: - frame/3/trsm/bli_trsm_blk_var2b.c - frame/3/trsm/bli_trsm_blk_var2f.c + frame/3/trsm/bli_trsm_blk_var2b.c + frame/3/trsm/bli_trsm_blk_var2f.c commit 699a8151ca3d5021e834a1784ef45dcc3a3d17cd Author: Tyler Smith @@ -3492,8 +4542,8 @@ Date: Fri Apr 4 09:54:54 2014 -0500 Merge http://github.com/flame/blis Conflicts: - kernels/bgq/1/bli_axpyv_opt_var1.c - kernels/bgq/1/bli_dotv_opt_var1.c + kernels/bgq/1/bli_axpyv_opt_var1.c + kernels/bgq/1/bli_dotv_opt_var1.c commit 4e3eb39aca4df0b9fdc003d468f368a2f2ba597d Author: Tyler Michael Smith @@ -3793,7 +4843,7 @@ Date: Thu Feb 27 16:46:23 2014 -0600 Merge https://github.com/flame/blis Conflicts: - frame/1m/packm/bli_packm_blk_var1.c + frame/1m/packm/bli_packm_blk_var1.c commit e8757b03a74f9891632242e9a90efb32150826f5 Author: Field G. Van Zee From 0b571cd94d9b175331c9453258a6b1389a718ae8 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 6 Oct 2016 14:48:15 -0500 Subject: [PATCH 18/27] Fixed segfault in bli_free_align() for NULL ptrs. Details: - Fixed a bug in bli_free_align() caused by failing to handle NULL pointers up-front, which led to performing pointer arithmetic on NULL pointers in order to free the address immediately before the pointer. 
Thanks to Devin Matthews for reporting this bug. --- frame/base/bli_malloc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frame/base/bli_malloc.c b/frame/base/bli_malloc.c index 191db4834..3a36378ae 100644 --- a/frame/base/bli_malloc.c +++ b/frame/base/bli_malloc.c @@ -145,6 +145,10 @@ void bli_free_align int8_t* p_byte; void** p_addr; + // If the pointer to free is NULL, it was obviously not aligned and + // does not need to be freed. + if ( p == NULL ) return; + // Since the bli_malloc_pool() function returned the aligned pointer, // we have to first recover the original pointer before we can free // the memory. From 22377abd84b9e560ffe1c4e4d284eb443ddb7133 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 10 Oct 2016 13:43:56 -0500 Subject: [PATCH 19/27] Fixed bli_gemm() segfault on empty C matrices. Details: - Fixed a bug that would manifest in the form of a segmentation fault in bli_cntl_free() when calling any level-3 operation on an empty output matrix (ie: m = n = 0). Specifically, the code previously assumed that the entire control tree was built prior to it being freed. However, if the level-3 operation performs an early exit, the control tree will be incomplete, and this scenario is now handled. Thanks to Elmar Peise for reporting this bug. --- frame/base/bli_cntl.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index 3b39befe4..2b45a5de3 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -107,9 +107,13 @@ void bli_cntl_free thrinfo_t* thread_sub_node = bli_thrinfo_sub_node( thread ); - // Recursively free all memory associated with the sub-node and its - // children. - bli_cntl_free( cntl_sub_node, thread_sub_node ); + // Only recurse if the current thrinfo_t node has a child. + if ( thread_sub_node != NULL ) + { + // Recursively free all memory associated with the sub-node and its + // children. 
+ bli_cntl_free( cntl_sub_node, thread_sub_node ); + } // Free the current node's params field, if it is non-NULL. if ( cntl_params != NULL ) From 9cda6057eaa16a24ac8785a9fa167df6c9edba44 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 11 Oct 2016 13:21:26 -0500 Subject: [PATCH 20/27] Removed previously renamed/old files. Details: - Removed frame/base/bli_mem.c and frame/include/bli_auxinfo_macro_defs.h, both of which were renamed/removed in 701b9aa. For some reason, these files survived when the compose branch was merged back into master. (Clearly, git's merging algorithm is not perfect.) - Removed frame/base/bli_mem.c.prev (an artifact of the long-ago changed memory allocator that I was keeping around for no particular reason). --- frame/base/bli_mem.c | 203 -------------- frame/base/bli_mem.c.prev | 366 ------------------------- frame/include/bli_auxinfo_macro_defs.h | 70 ----- 3 files changed, 639 deletions(-) delete mode 100644 frame/base/bli_mem.c delete mode 100644 frame/base/bli_mem.c.prev delete mode 100644 frame/include/bli_auxinfo_macro_defs.h diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c deleted file mode 100644 index 83b936aae..000000000 --- a/frame/base/bli_mem.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2016 Hewlett Packard Enterprise Development LP - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_PTHREADS -pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; -#endif - -static membrk_t global_membrk; - -// ----------------------------------------------------------------------------- - -membrk_t* bli_mem_global_membrk( void ) -{ - return &global_membrk; -} - -siz_t bli_mem_pool_size( packbuf_t buf_type ) -{ - siz_t r_val; - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - // We don't (yet) track the amount of general-purpose - // memory that is currently allocated. - r_val = 0; - } - else - { - dim_t pool_index; - pool_t* pool; - - // Acquire the pointer to the pool corresponding to the buf_type - // provided. - pool_index = bli_packbuf_index( buf_type ); - pool = bli_membrk_pool( pool_index, &global_membrk ); - - // Compute the pool "size" as the product of the block size - // and the number of blocks in the pool. 
- r_val = bli_pool_block_size( pool ) * - bli_pool_num_blocks( pool ); - } - - return r_val; -} - -// ----------------------------------------------------------------------------- - -static bool_t bli_mem_is_init = FALSE; - -void bli_mem_init( void ) -{ - cntx_t cntx; - - // If the initialization flag is TRUE, we know the API is already - // initialized, so we can return early. - if ( bli_mem_is_init == TRUE ) return; - - // Create and initialize a context for gemm so we have something - // to pass into bli_mem_init_pools(). - bli_gemm_cntx_init( &cntx ); - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - // Here, we test the initialization flag again. NOTE: THIS IS NOT - // REDUNDANT. This additional test is needed so that other threads - // that may be waiting to acquire the lock do not perform any - // initialization actions once they are finally allowed into this - // critical section. - if ( bli_mem_is_init == FALSE ) - { - // Initialize the global membrk_t object and its memory pools. - bli_membrk_init( &cntx, &global_membrk ); - - // After initialization, mark the API as initialized. - bli_mem_is_init = TRUE; - } - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - - // Finalize the temporary gemm context. - bli_gemm_cntx_finalize( &cntx ); -} - -void bli_mem_reinit( cntx_t* cntx ) -{ -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - // If for some reason the memory pools have not yet been - // initialized (unlikely), we emulate the body of bli_mem_init(). - if ( bli_mem_is_init == FALSE ) - { - // Initialize the global membrk_t object and its memory pools. 
- bli_membrk_init( cntx, &global_membrk ); - - // After initialization, mark the API as initialized. - bli_mem_is_init = TRUE; - } - else - { - // Reinitialize the global membrk_t object's memory pools. - bli_membrk_reinit_pools( cntx, &global_membrk ); - } - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif -} - -void bli_mem_finalize( void ) -{ - // If the initialization flag is FALSE, we know the API is already - // uninitialized, so we can return early. - if ( bli_mem_is_init == FALSE ) return; - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - // Here, we test the initialization flag again. NOTE: THIS IS NOT - // REDUNDANT. This additional test is needed so that other threads - // that may be waiting to acquire the lock do not perform any - // finalization actions once they are finally allowed into this - // critical section. - if ( bli_mem_is_init == TRUE ) - { - // Finalize the global membrk_t object and its memory pools. - bli_membrk_finalize( &global_membrk ); - - // After finalization, mark the API as uninitialized. - bli_mem_is_init = FALSE; - } - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif -} - -bool_t bli_mem_is_initialized( void ) -{ - return bli_mem_is_init; -} - diff --git a/frame/base/bli_mem.c.prev b/frame/base/bli_mem.c.prev deleted file mode 100644 index 7a16e8732..000000000 --- a/frame/base/bli_mem.c.prev +++ /dev/null @@ -1,366 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_PTHREADS -extern pthread_mutex_t mem_manager_mutex; -#endif - -// Declare one memory pool structure for each block size/shape we want to -// be able to allocate. - -static pool_t pools[3]; - - -// Physically contiguous memory for each pool. -// -// Generally speaking, the pool sizes are computed in a sub-header of blis.h -// as follows: -// -// BLIS_MK_POOL_SIZE = BLIS_MAXIMUM_MC_? * BLIS_MAXIMUM_KC_? 
* BLIS_SIZEOF_? -// -// where "?" is the datatype that results in the largest pool size. The -// constants BLIS_KN_POOL_SIZE and BLIS_MN_POOL_SIZE are computed in a -// similar manner. All constants are computed with appropriate "padding" -// to ensure enough space given the alignments required by bli_config.h. -// - -static void* pool_mk_blk_ptrs[ BLIS_NUM_MC_X_KC_BLOCKS ]; -static void* pool_kn_blk_ptrs[ BLIS_NUM_KC_X_NC_BLOCKS ]; -static void* pool_mn_blk_ptrs[ BLIS_NUM_MC_X_NC_BLOCKS ]; - -#define BLIS_USE_HEAP - -#ifdef BLIS_USE_HEAP -static char* pool_mk_mem = NULL; -static char* pool_kn_mem = NULL; -static char* pool_mn_mem = NULL; -#else -static char pool_mk_mem[ BLIS_MK_POOL_SIZE ]; -static char pool_kn_mem[ BLIS_KN_POOL_SIZE ]; -static char pool_mn_mem[ BLIS_MN_POOL_SIZE ]; -#endif - - - -void bli_mem_acquire_m( siz_t req_size, - packbuf_t buf_type, - mem_t* mem ) -{ - siz_t block_size; - dim_t pool_index; - pool_t* pool; - void** block_ptrs; - void* block; - gint_t i; - - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - // For general-use buffer requests, such as those used by level-2 - // operations, using bli_malloc() is sufficient, since using - // physically contiguous memory is not as important there. - block = bli_malloc( req_size ); - - // Initialize the mem_t object with: - // - the address of the memory block, - // - the buffer type (a packbuf_t value), and - // - the size of the requested region. - // NOTE: We do not initialize the pool field since this block did not - // come from a contiguous memory pool. - bli_mem_set_buffer( block, mem ); - bli_mem_set_buf_type( buf_type, mem ); - bli_mem_set_size( req_size, mem ); - } - else - { - // This branch handles cases where the memory block needs to come - // from one of the contiguous memory pools. - - // Map the requested packed buffer type to a zero-based index, which - // we then use to select the corresponding memory pool. 
- pool_index = bli_packbuf_index( buf_type ); - pool = &pools[ pool_index ]; - - // Unconditionally perform error checking on the memory pool. - { - err_t e_val; - - // Make sure that the requested matrix size fits inside of a block - // of the corresponding pool. - e_val = bli_check_requested_block_size_for_pool( req_size, pool ); - bli_check_error_code( e_val ); - - // Make sure that the pool contains at least one block to check out - // to the thread. - e_val = bli_check_if_exhausted_pool( pool ); - bli_check_error_code( e_val ); - } - - // Access the block pointer array from the memory pool data structure. - block_ptrs = bli_pool_block_ptrs( pool ); - - - // BEGIN CRITICAL SECTION -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - { - - // Query the index of the contiguous memory block that resides at the - // "top" of the pool. - i = bli_pool_top_index( pool ); - - // Extract the address of the top block from the block pointer array. - block = block_ptrs[i]; - - // Clear the entry from the block pointer array. (This is actually not - // necessary.) - //block_ptrs[i] = NULL; - - // Decrement the top of the memory pool. - bli_pool_dec_top_index( pool ); - - - // END CRITICAL SECTION - } -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - - // Query the size of the blocks in the pool so we can store it in the - // mem_t object. - block_size = bli_pool_block_size( pool ); - - // Initialize the mem_t object with: - // - the address of the memory block, - // - the buffer type (a packbuf_t value), - // - the address of the memory pool to which it belongs, and - // - the size of the contiguous memory block (NOT the size of the - // requested region). 
- bli_mem_set_buffer( block, mem ); - bli_mem_set_buf_type( buf_type, mem ); - bli_mem_set_pool( pool, mem ); - bli_mem_set_size( block_size, mem ); - } -} - - -void bli_mem_release( mem_t* mem ) -{ - packbuf_t buf_type; - pool_t* pool; - void** block_ptrs; - void* block; - gint_t i; - - // Extract the address of the memory block we are trying to - // release. - block = bli_mem_buffer( mem ); - - // Extract the buffer type so we know what kind of memory was allocated. - buf_type = bli_mem_buf_type( mem ); - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - // For general-use buffers, we allocate with bli_malloc(), and so - // here we need to call bli_free(). - bli_free( block ); - } - else - { - // This branch handles cases where the memory block came from one - // of the contiguous memory pools. - - // Extract the pool from which the block was allocated. - pool = bli_mem_pool( mem ); - - // Extract the block pointer array associated with the pool. - block_ptrs = bli_pool_block_ptrs( pool ); - - - // BEGIN CRITICAL SECTION -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - { - - // Increment the top of the memory pool. - bli_pool_inc_top_index( pool ); - - // Query the newly incremented top index. - i = bli_pool_top_index( pool ); - - // Place the address of the block back onto the top of the memory pool. - block_ptrs[i] = block; - - - // END CRITICAL SECTION - } -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - } - - - // Clear the mem_t object so that it appears unallocated. We clear: - // - the buffer field, - // - the pool field, and - // - the size field. - // NOTE: We do not clear the buf_type field since there is no - // "uninitialized" value for packbuf_t. 
- bli_mem_set_buffer( NULL, mem ); - bli_mem_set_pool( NULL, mem ); - bli_mem_set_size( 0, mem ); -} - - -void bli_mem_acquire_v( siz_t req_size, - mem_t* mem ) -{ - bli_mem_acquire_m( req_size, - BLIS_BUFFER_FOR_GEN_USE, - mem ); -} - - - -void bli_mem_init() -{ - dim_t index_a; - dim_t index_b; - dim_t index_c; - -#ifdef BLIS_USE_HEAP - pool_mk_mem = bli_malloc( BLIS_MK_POOL_SIZE ); - pool_kn_mem = bli_malloc( BLIS_KN_POOL_SIZE ); - pool_mn_mem = bli_malloc( BLIS_MN_POOL_SIZE ); -#endif - - // Map each of the packbuf_t values to an index starting at zero. - index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); - index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); - index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); - - // Initialize contiguous memory pool for MC x KC blocks. - bli_mem_init_pool( pool_mk_mem, - BLIS_MK_BLOCK_SIZE, - BLIS_NUM_MC_X_KC_BLOCKS, - pool_mk_blk_ptrs, - &pools[ index_a ] ); - - // Initialize contiguous memory pool for KC x NC blocks. - bli_mem_init_pool( pool_kn_mem, - BLIS_KN_BLOCK_SIZE, - BLIS_NUM_KC_X_NC_BLOCKS, - pool_kn_blk_ptrs, - &pools[ index_b ] ); - - // Initialize contiguous memory pool for MC x NC blocks. - bli_mem_init_pool( pool_mn_mem, - BLIS_MN_BLOCK_SIZE, - BLIS_NUM_MC_X_NC_BLOCKS, - pool_mn_blk_ptrs, - &pools[ index_c ] ); -} - - -void bli_mem_init_pool( char* pool_mem, - siz_t block_size, - dim_t num_blocks, - void** block_ptrs, - pool_t* pool ) -{ - const siz_t align_size = BLIS_CONTIG_ADDR_ALIGN_SIZE; - dim_t i; - - // If the pool starting address is not already aligned, advance it - // accordingly. - if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) - { - // Notice that this works even if the alignment is not a power of two. 
- pool_mem += ( ( uintptr_t )align_size - - ( ( uintptr_t )pool_mem % align_size ) ); - } - - // Step through the memory pool, beginning with the aligned address - // determined above, assigning pointers to the beginning of each block_size - // bytes to the ith element of the block_ptrs array. - for ( i = 0; i < num_blocks; ++i ) - { - // Save the address of pool, which is guaranteed to be aligned. - block_ptrs[i] = pool_mem; - - // Advance pool by one block. - pool_mem += block_size; - - // Advance pool a bit further if needed in order to get to the - // beginning of an alignment boundary. - if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) - { - pool_mem += ( ( uintptr_t )align_size - - ( ( uintptr_t )pool_mem % align_size ) ); - } - } - - // Now that we have initialized the array of pointers to the individual - // blocks in the pool, we initialize a pool_t data structure so that we - // can easily manage this pool. - bli_pool_init( num_blocks, - block_size, - block_ptrs, - pool ); -} - - - -void bli_mem_finalize() -{ - // Nothing to do. - -#ifdef BLIS_USE_HEAP - bli_free( pool_mk_mem ); - bli_free( pool_kn_mem ); - bli_free( pool_mn_mem ); -#endif - -} - diff --git a/frame/include/bli_auxinfo_macro_defs.h b/frame/include/bli_auxinfo_macro_defs.h deleted file mode 100644 index aee1869a0..000000000 --- a/frame/include/bli_auxinfo_macro_defs.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifndef BLIS_AUXINFO_MACRO_DEFS_H -#define BLIS_AUXINFO_MACRO_DEFS_H - - -// auxinfo_t field query - -#define bli_auxinfo_schema_a( auxinfo ) ( (auxinfo)->schema_a ) -#define bli_auxinfo_schema_b( auxinfo ) ( (auxinfo)->schema_b ) - -#define bli_auxinfo_next_a( auxinfo ) ( (auxinfo)->a_next ) -#define bli_auxinfo_next_b( auxinfo ) ( (auxinfo)->b_next ) - -#define bli_auxinfo_is_a( auxinfo ) ( (auxinfo)->is_a ) -#define bli_auxinfo_is_b( auxinfo ) ( (auxinfo)->is_b ) - - -// auxinfo_t field modification - -#define bli_auxinfo_set_schema_a( schema, auxinfo ) { (auxinfo).schema_a = schema; } -#define bli_auxinfo_set_schema_b( schema, auxinfo ) { (auxinfo).schema_b = schema; } - -#define bli_auxinfo_set_next_a( a_p, auxinfo ) { (auxinfo).a_next = a_p; } -#define bli_auxinfo_set_next_b( b_p, auxinfo ) { (auxinfo).b_next = b_p; } - -#define bli_auxinfo_set_next_ab( a_p, b_p, auxinfo ) \ -{ \ - bli_auxinfo_set_next_a( a_p, auxinfo ); \ - bli_auxinfo_set_next_b( b_p, auxinfo ); \ -} - -#define bli_auxinfo_set_is_a( is, auxinfo ) { (auxinfo).is_a = is; } -#define bli_auxinfo_set_is_b( is, auxinfo ) { (auxinfo).is_b = is; } - - -#endif - From 11eed3f683d09e65f721567b346b0f733bff9a64 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 13 Oct 2016 14:23:23 -0500 Subject: [PATCH 21/27] Fixed a configure -t omp/openmp bug from fd04869. Details: - Forgot to update certain occurrences of "omp" in common.mk during commit fd04869, which changed the preferred configure option string for enabling OpenMP from "omp" to "openmp". 
--- common.mk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common.mk b/common.mk index 458cdcc03..683d0b0e9 100644 --- a/common.mk +++ b/common.mk @@ -153,9 +153,9 @@ endif ifeq ($(CC_VENDOR),gcc) ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := omp +THREADING_MODEL := openmp endif -ifeq ($(THREADING_MODEL),omp) +ifeq ($(THREADING_MODEL),openmp) CTHREADFLAGS := -fopenmp LDFLAGS += -fopenmp endif @@ -167,9 +167,9 @@ endif ifeq ($(CC_VENDOR),icc) ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := omp +THREADING_MODEL := openmp endif -ifeq ($(THREADING_MODEL),omp) +ifeq ($(THREADING_MODEL),openmp) CTHREADFLAGS := -fopenmp LDFLAGS += -fopenmp endif @@ -183,7 +183,7 @@ ifeq ($(CC_VENDOR),clang) ifeq ($(THREADING_MODEL),auto) THREADING_MODEL := pthreads endif -ifeq ($(THREADING_MODEL),omp) +ifeq ($(THREADING_MODEL),openmp) CTHREADFLAGS := -fopenmp LDFLAGS += -fopenmp endif From 28b2af8a71133ce68774e153b6e05afb05affba8 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 13 Oct 2016 14:50:08 -0500 Subject: [PATCH 22/27] Added disabled code to print thrinfo_t structures. Details: - Added cpp-guarded code to bli_thrcomm_openmp.c that allows a curious developer to print the contents of the thrinfo_t structures of each thread, for verification purposes or just to study the way thread information and communicators are used in BLIS. - Enabled some previously-disabled code in bli_l3_thrinfo.c for freeing an array of thrinfo_t* values that is used in the new, cpp-guarded code mentioned above. - Removed some old commented lines from bli_gemm_front.c. 
--- frame/3/bli_l3_thrinfo.c | 2 +- frame/3/bli_l3_thrinfo.h | 2 +- frame/3/gemm/bli_gemm_front.c | 12 ------------ frame/thread/bli_thrcomm_openmp.c | 17 +++++++++++++++++ 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 78b2b775c..33027a1e8 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -422,6 +422,7 @@ exit(1); return paths; } +#endif void bli_l3_thrinfo_free_paths ( @@ -436,5 +437,4 @@ void bli_l3_thrinfo_free_paths bli_free_intl( threads ); } -#endif diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 71dea7645..fcf1f507d 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -121,10 +121,10 @@ thrinfo_t** bli_l3_thrinfo_create_full_paths ( cntx_t* cntx ); +#endif void bli_l3_thrinfo_free_paths ( thrinfo_t** threads ); -#endif diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 324655655..533a6dcaf 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -88,13 +88,6 @@ void bli_gemm_front // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx ); - // Create the first node in the thrinfo_t tree for each thread. -//thrinfo_t** infos = bli_l3_thrinfo_create_full_paths( cntx ); -//bli_l3_thrinfo_print_paths( infos ); -//exit(1); -//cntl = bli_gemm_cntl_create( BLIS_GEMM ); - //thrinfo_t** infos = bli_l3_thrinfo_create_roots( cntx, cntl ); - // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( @@ -107,10 +100,5 @@ void bli_gemm_front cntx, cntl ); -//bli_l3_thrinfo_print_paths( infos ); -//exit(1); - - // Free the thrinfo_t structures. 
- //bli_l3_thrinfo_free_paths( infos ); } diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 68d9d7a29..0882d1659 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -199,6 +199,8 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) #endif +//#define PRINT_THRINFO + void bli_l3_thread_decorator ( l3int_t func, @@ -217,6 +219,10 @@ void bli_l3_thread_decorator // Allcoate a global communicator for the root thrinfo_t structures. thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); +#ifdef PRINT_THRINFO + thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) ); +#endif + _Pragma( "omp parallel num_threads(n_threads)" ) { dim_t id = omp_get_thread_num(); @@ -245,13 +251,24 @@ void bli_l3_thread_decorator // Free the control tree, if one was created locally. bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); +#ifdef PRINT_THRINFO + threads[id] = thread; +#else // Free the current thread's thrinfo_t structure. bli_l3_thrinfo_free( thread ); +#endif } // We shouldn't free the global communicator since it was already freed // by the global communicator's chief thread in bli_l3_thrinfo_free() // (called above). + + +#ifdef PRINT_THRINFO + bli_l3_thrinfo_print_paths( threads ); + bli_l3_thrinfo_free_paths( threads ); + exit(1); +#endif } #endif From 970745a5fc7c29de3e202988e5eb104fabca4fdc Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 19 Oct 2016 15:58:03 -0500 Subject: [PATCH 23/27] Reorganized typedefs to avoid compiler warnings. Details: - Relocated membrk_t definition from bli_membrk.h to bli_type_defs.h. - Moved #include of bli_malloc.h from blis.h to bli_type_defs.h. - Removed standalone mtx_t and mutex_t typedefs in bli_type_defs.h. - Moved #include of bli_mutex.h from bli_thread.h to bli_type_defs.h. - The redundant typedefs of membrk_t and mtx_t caused a warning on some C compilers. Thanks to Tyler Smith for reporting this issue. 
--- frame/base/bli_membrk.h | 10 ---------- frame/include/bli_type_defs.h | 13 ++++++------- frame/include/blis.h | 1 - frame/thread/bli_thread.h | 3 --- 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/frame/base/bli_membrk.h b/frame/base/bli_membrk.h index 5db956344..cce0f4c1a 100644 --- a/frame/base/bli_membrk.h +++ b/frame/base/bli_membrk.h @@ -36,16 +36,6 @@ #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H -// -- Memory broker object type -- - -typedef struct membrk_s -{ - pool_t pools[3]; - mtx_t mutex; - - malloc_ft malloc_fp; - free_ft free_fp; -} membrk_t; #define bli_membrk_pool( pool_index, membrk_p ) \ \ diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 99b2c601d..d3548031c 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -715,10 +715,6 @@ typedef enum // -- BLIS misc. structure types ----------------------------------------------- // -// -- Mutex type -- - -typedef struct mtx_s mtx_t; - // -- Pool block type -- typedef struct @@ -741,10 +737,14 @@ typedef struct siz_t align_size; } pool_t; +// -- Mutex object type -- + +#include "bli_mutex.h" +#include "bli_malloc.h" + // -- Memory broker object type -- -typedef struct membrk_s membrk_t; -/* +typedef struct membrk_s { pool_t pools[3]; mtx_t mutex; @@ -752,7 +752,6 @@ typedef struct membrk_s membrk_t; malloc_ft malloc_fp; free_ft free_fp; } membrk_t; -*/ // -- Memory object type -- diff --git a/frame/include/blis.h b/frame/include/blis.h index 0eaaf413f..73f8fb20a 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -98,7 +98,6 @@ extern "C" { #include "bli_init.h" #include "bli_const.h" -#include "bli_malloc.h" #include "bli_obj.h" #include "bli_obj_scalar.h" #include "bli_cntx.h" diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 5b9443587..78135e426 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -52,9 +52,6 @@ #define BLIS_ENABLE_MULTITHREADING #endif -// Include thread 
mutex (mtx_t) object definitions and prototypes. -#include "bli_mutex.h" - // Include thread communicator (thrcomm_t) object definitions and prototypes. #include "bli_thrcomm.h" From 8feb0f85a674e84bec2417486e3bcea584b14c04 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 19 Oct 2016 16:05:41 -0500 Subject: [PATCH 24/27] Removed auto-prototyping of malloc()/free() substitutes. Details: - Removed the header file, bli_malloc_prototypes.h, which automatically generated prototypes for the functions specified by the following cpp macros: BLIS_MALLOC_INTL BLIS_FREE_INTL BLIS_MALLOC_POOL BLIS_FREE_POOL BLIS_MALLOC_USER BLIS_FREE_USER These prototypes were originally provided primarily as a convenience to those developers who specified their own malloc()/free() substitutes for one or more of the following. However, we generated these prototypes regardless, even when the default values (malloc and free) of the macros above were used. A problem arose under certain circumstances (e.g., gcc in C++ mode on Linux with glibc) when including blis.h that stemmed from the "throw" specification which was added to the glibc's malloc() prototype, resulting in a prototype mismatch. Therefore, going forward, developers who specify their own custom malloc()/free() substitutes must also prototype those substitutes via bli_kernel.h. Thanks to Krzysztof Drewniak for reporting this bug, and Devin Matthews for researching the nature and potential solutions. --- frame/include/bli_malloc_prototypes.h | 50 --------------------------- frame/include/blis.h | 2 -- 2 files changed, 52 deletions(-) delete mode 100644 frame/include/bli_malloc_prototypes.h diff --git a/frame/include/bli_malloc_prototypes.h b/frame/include/bli_malloc_prototypes.h deleted file mode 100644 index e828f99aa..000000000 --- a/frame/include/bli_malloc_prototypes.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifndef BLIS_MALLOC_PROTOTYPES_H -#define BLIS_MALLOC_PROTOTYPES_H - -// Generate prototypes for each of the malloc() and free() functions -// defined in BLIS - -void* BLIS_MALLOC_POOL( size_t size ); -void BLIS_FREE_POOL( void* p ); - -void* BLIS_MALLOC_INTL( size_t size ); -void BLIS_FREE_INTL( void* p ); - -void* BLIS_MALLOC_USER( size_t size ); -void BLIS_FREE_USER( void* p ); - -#endif diff --git a/frame/include/blis.h b/frame/include/blis.h index 73f8fb20a..6c8104e31 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -91,8 +91,6 @@ extern "C" { #include "bli_kernel_prototypes.h" -#include "bli_malloc_prototypes.h" - // -- Base operation prototypes -- From 936d5fdc26c6c4dab199a8d11fde948975cfa1d6 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 21 Oct 2016 14:34:27 -0500 Subject: [PATCH 25/27] Fixed multithreading compilation bug in 970745a. Details: - Moved the definition of the cpp macro BLIS_ENABLE_MULTITHREADING from bli_thread.h to bli_config_macro_defs.h. Also moved the sanity check that OpenMP and POSIX threads are not both enabled. - Thanks to Krzysztof Drewniak for reporting this bug. --- frame/include/bli_config_macro_defs.h | 16 ++++++++++++++++ frame/thread/bli_thread.h | 16 ---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 577a4f5f8..e66851194 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -82,6 +82,22 @@ // Default behavior is disabled. #endif +// Perform a sanity check to make sure the user doesn't try to enable +// both OpenMP and pthreads. +#if defined ( BLIS_ENABLE_OPENMP ) && \ + defined ( BLIS_ENABLE_PTHREADS ) + #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." +#endif + +// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP +// or pthreads are enabled. 
This macro is useful in situations when +// we want to detect use of either OpenMP or pthreads (as opposed +// to neither being used). +#if defined ( BLIS_ENABLE_OPENMP ) || \ + defined ( BLIS_ENABLE_PTHREADS ) + #define BLIS_ENABLE_MULTITHREADING +#endif + // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 78135e426..c5aa544fb 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -36,22 +36,6 @@ #ifndef BLIS_THREAD_H #define BLIS_THREAD_H -// Perform a sanity check to make sure the user doesn't try to enable -// both OpenMP and pthreads. -#if defined ( BLIS_ENABLE_OPENMP ) && \ - defined ( BLIS_ENABLE_PTHREADS ) - #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." -#endif - -// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP -// or pthreads are enabled. This macro is useful in situations when -// we want to detect use of either OpenMP or pthreads (as opposed -// to neither being used). -#if defined ( BLIS_ENABLE_OPENMP ) || \ - defined ( BLIS_ENABLE_PTHREADS ) - #define BLIS_ENABLE_MULTITHREADING -#endif - // Include thread communicator (thrcomm_t) object definitions and prototypes. #include "bli_thrcomm.h" From 5117d444f7f3a2bc327f067926eaf2398212edda Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 24 Oct 2016 16:20:47 -0500 Subject: [PATCH 26/27] Change .align to .p2align in Bulldozer ukernels Apparently OSX doesn't allow .align directives for >16B, so I've changed these to their .p2align counterparts. 
--- kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c b/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c index 4aad807d2..fc7f750b4 100644 --- a/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c +++ b/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c @@ -763,7 +763,7 @@ void bli_sgemm_asm_8x8_fma4 #undef KERNEL4x6_4 #define KERNEL4x6_1(xx) \ - ".align 4 \n\t"\ + ".p2align 2 \n\t"\ "vmovddup -8 * 8(%%rax), %%xmm0 \n\t"\ "vfmaddpd %%xmm4, %%xmm1, %%xmm0, %%xmm4 \n\t"\ "vfmaddpd %%xmm5, %%xmm2, %%xmm0, %%xmm5 \n\t"\ @@ -888,7 +888,7 @@ void bli_dgemm_asm_4x6_fma4 "testq %%rsi, %%rsi \n\t" "je .CONSIDERKLEFT \n\t" " \n\t" - ".align 32 \n\t" + ".p2align 5 \n\t" ".LOOPKITER: \n\t" // MAIN LOOP " \n\t" KERNEL4x6_1(xx) From 0662a3c1b1f4644a86bf8e5073d1391808c91b4a Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 25 Oct 2016 12:42:44 -0500 Subject: [PATCH 27/27] Add flexible options for thread model (pthread/posix for pthreads etc.). --- configure | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 820857334..3a1e296a7 100755 --- a/configure +++ b/configure @@ -506,11 +506,14 @@ main() echo "${script_name}: using OpenMP for threading." enable_openmp='yes' enable_openmp_01=1 - elif [ "x${threading_model}" = "xpthreads" ]; then + elif [ "x${threading_model}" = "xpthreads" ] || + [ "x${threading_model}" = "xpthread" ] || + [ "x${threading_model}" = "xposix" ]; then echo "${script_name}: using Pthreads for threading." enable_pthreads='yes' enable_pthreads_01=1 - elif [ "x${threading_model}" = "xno" ]; then + elif [ "x${threading_model}" = "xno" ] || + [ "x${threading_model}" = "xnone" ]; then echo "${script_name}: threading is disabled." else echo "Unsupported threading model: ${threading_model}."