diff --git a/README.md b/README.md index 722893b12..16789103a 100644 --- a/README.md +++ b/README.md @@ -7,16 +7,17 @@ Introduction ------------ BLIS is a portable software framework for instantiating high-performance -BLAS-like dense linear algebra libraries. The framework was designed to -isolate essential kernels of computation that, when optimized, immediately -enable optimized implementations of most of its commonly used and -computationally intensive operations. BLIS is written in [ISO +BLAS-like dense linear algebra libraries. The framework was designed to isolate +essential kernels of computation that, when optimized, immediately enable +optimized implementations of most of its commonly used and computationally +intensive operations. BLIS is written in [ISO C99](http://en.wikipedia.org/wiki/C99) and available under a [new/modified/3-clause BSD license](http://opensource.org/licenses/BSD-3-Clause). While BLIS exports a -[new BLAS-like API](), it also includes a BLAS compatibility layer which gives -application developers access to BLIS implementations via traditional [BLAS -routine calls](http://www.netlib.org/lapack/lug/node145.html). +[new BLAS-like API](https://github.com/flame/blis/wiki/BLISAPIQuickReference), +it also includes a BLAS compatibility layer which gives application developers +access to BLIS implementations via traditional [BLAS routine +calls](http://www.netlib.org/lapack/lug/node145.html). 
For a thorough presentation of our framework, please read our recently accepted journal article, ["BLIS: A Framework for Rapidly Instantiating BLAS diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 169b6101d..71df53ad7 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -125,6 +125,18 @@ #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif +// zgemm micro-kernel + +#if 1 +#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 +#define BLIS_DEFAULT_MC_Z 72 +#define BLIS_DEFAULT_KC_Z 256 +#define BLIS_DEFAULT_NC_Z 4080 +#define BLIS_DEFAULT_MR_Z 3 +#define BLIS_DEFAULT_NR_Z 4 + +#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif // -- trsm-related -- diff --git a/configure b/configure index e0dc82c89..e5e3b21e7 100755 --- a/configure +++ b/configure @@ -91,7 +91,7 @@ print_usage() echo " -t MODEL, --enable-threading[=MODEL], --disable-threading" echo " " echo " Enable threading in the library, using threading model" - echo " MODEL={omp,pthreads,no}. If MODEL=no or " + echo " MODEL={openmp,pthreads,no}. If MODEL=no or " echo " --disable-threading is specified, threading will be" echo " disabled. The default is 'no'." echo " " @@ -486,13 +486,15 @@ main() # Check the threading model flag. + # NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred. enable_openmp='no' enable_openmp_01=0 enable_pthreads='no' enable_pthreads_01=0 if [ "x${threading_model}" = "xauto" ]; then echo "${script_name}: determining the threading model automatically." - elif [ "x${threading_model}" = "xomp" ]; then + elif [ "x${threading_model}" = "xopenmp" ] || + [ "x${threading_model}" = "xomp" ]; then echo "${script_name}: using OpenMP for threading." 
enable_openmp='yes' enable_openmp_01=1 diff --git a/frame/0/bli_l0_check.c b/frame/0/bli_l0_check.c index da47a6fd5..fc1c4c71a 100644 --- a/frame/0/bli_l0_check.c +++ b/frame/0/bli_l0_check.c @@ -99,8 +99,8 @@ void bli_getsc_check // Check object datatypes. - e_val = bli_check_noninteger_object( chi ); - bli_check_error_code( e_val ); + //e_val = bli_check_noninteger_object( chi ); + //bli_check_error_code( e_val ); // Check object dimensions. @@ -125,8 +125,8 @@ void bli_setsc_check // Check object datatypes. - e_val = bli_check_floating_object( chi ); - bli_check_error_code( e_val ); + //e_val = bli_check_floating_object( chi ); + //bli_check_error_code( e_val ); // Check object dimensions. diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c index d20f8ea45..3858e05b7 100644 --- a/frame/0/bli_l0_oapi.c +++ b/frame/0/bli_l0_oapi.c @@ -198,8 +198,8 @@ void PASTEMAC0(opname) \ if ( bli_is_constant( dt_chi ) ) dt_use = dt_def; \ else dt_use = dt_chi; \ \ - /* Invoke the typed function. */ \ - bli_call_ft_3 \ + /* Invoke the typed function (with integer support). */ \ + bli_call_ft_3i \ ( \ dt_use, \ opname, \ @@ -229,8 +229,8 @@ void PASTEMAC0(opname) \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \ \ - /* Invoke the typed function. */ \ - bli_call_ft_3 \ + /* Invoke the typed function (with integer support). 
*/ \ + bli_call_ft_3i \ ( \ dt_chi, \ opname, \ diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c index 53f5be271..028a12cbd 100644 --- a/frame/0/bli_l0_tapi.c +++ b/frame/0/bli_l0_tapi.c @@ -227,3 +227,25 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNCR_BASIC0( zipsc ) +// ----------------------------------------------------------------------------- + +void bli_igetsc + ( + dim_t* chi, + double* zeta_r, + double* zeta_i + ) +{ + PASTEMAC2(i,d,gets)( *chi, *zeta_r, *zeta_i ); +} + +void bli_isetsc + ( + double zeta_r, + double zeta_i, + dim_t* chi + ) +{ + PASTEMAC2(d,i,sets)( zeta_r, zeta_i, *chi ); +} + diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h index 678e27292..36b282824 100644 --- a/frame/0/bli_l0_tapi.h +++ b/frame/0/bli_l0_tapi.h @@ -141,3 +141,19 @@ void PASTEMAC(ch,opname) \ INSERT_GENTPROTR_BASIC( zipsc ) +// ----------------------------------------------------------------------------- + +void bli_igetsc + ( + dim_t* chi, + double* zeta_r, + double* zeta_i + ); + +void bli_isetsc + ( + double zeta_r, + double zeta_i, + dim_t* chi + ); + diff --git a/frame/1/bli_l1v.h b/frame/1/bli_l1v.h index f557118f0..bd4879247 100644 --- a/frame/1/bli_l1v.h +++ b/frame/1/bli_l1v.h @@ -46,12 +46,14 @@ #include "bli_l1v_tapi.h" // Pack-related -#include "bli_packv.h" -#include "bli_unpackv.h" +// NOTE: packv and unpackv are temporarily disabled. +//#include "bli_packv.h" +//#include "bli_unpackv.h" // Other -#include "bli_scalv_cntl.h" -#include "bli_scalv_int.h" +// NOTE: scalv control tree code is temporarily disabled. 
+//#include "bli_scalv_cntl.h" +//#include "bli_scalv_int.h" // Reference kernel headers #include "bli_l1v_ref.h" diff --git a/frame/1/bli_l1v_check.c b/frame/1/bli_l1v_check.c index b998a65fb..54c856b45 100644 --- a/frame/1/bli_l1v_check.c +++ b/frame/1/bli_l1v_check.c @@ -56,6 +56,21 @@ GENFRONT( subv ) GENFRONT( swapv ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* index \ + ) \ +{ \ + bli_l1v_xi_check( x, index ); \ +} + +GENFRONT( amaxv ) + + #undef GENFRONT #define GENFRONT( opname ) \ \ @@ -481,3 +496,39 @@ void bli_l1v_ax_check bli_check_error_code( e_val ); } +void bli_l1v_xi_check + ( + obj_t* x, + obj_t* index + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_integer_object( index ); + bli_check_error_code( e_val ); + + e_val = bli_check_nonconstant_object( index ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_vector_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_scalar_object( index ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). 
+ + e_val = bli_check_object_buffer( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( index ); + bli_check_error_code( e_val ); +} + diff --git a/frame/1/bli_l1v_check.h b/frame/1/bli_l1v_check.h index d4a1e9ff9..ddfe6a050 100644 --- a/frame/1/bli_l1v_check.h +++ b/frame/1/bli_l1v_check.h @@ -44,7 +44,7 @@ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ - ); + ); GENTPROT( addv ) GENTPROT( copyv ) @@ -52,6 +52,18 @@ GENTPROT( subv ) GENTPROT( swapv ) +#undef GENTPROT +#define GENTPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* index \ + ); + +GENTPROT( amaxv ) + + #undef GENTPROT #define GENTPROT( opname ) \ \ @@ -74,7 +86,7 @@ void PASTEMAC(opname,_check) \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ - ); + ); GENTPROT( axpyv ) GENTPROT( scal2v ) @@ -88,7 +100,7 @@ void PASTEMAC(opname,_check) \ obj_t* x, \ obj_t* y, \ obj_t* rho \ - ); + ); GENTPROT( dotv ) @@ -103,7 +115,7 @@ void PASTEMAC(opname,_check) \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ - ); + ); GENTPROT( dotxv ) @@ -114,7 +126,7 @@ GENTPROT( dotxv ) void PASTEMAC(opname,_check) \ ( \ obj_t* x \ - ); + ); GENTPROT( invertv ) @@ -126,7 +138,7 @@ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ - ); + ); GENTPROT( scalv ) GENTPROT( setv ) @@ -196,3 +208,9 @@ void bli_l1v_ax_check obj_t* x ); +void bli_l1v_xi_check + ( + obj_t* x, + obj_t* index + ); + diff --git a/frame/1/bli_l1v_cntx.c b/frame/1/bli_l1v_cntx.c index a1bba0354..bdbb0063f 100644 --- a/frame/1/bli_l1v_cntx.c +++ b/frame/1/bli_l1v_cntx.c @@ -55,6 +55,7 @@ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ } GENFRONT( addv, BLIS_ADDV_KER ) +GENFRONT( amaxv, BLIS_AMAXV_KER ) GENFRONT( copyv, BLIS_COPYV_KER ) GENFRONT( dotv, BLIS_DOTV_KER ) GENFRONT( dotxv, BLIS_DOTXV_KER ) diff --git a/frame/1/bli_l1v_cntx.h b/frame/1/bli_l1v_cntx.h index a8c16d342..95cd4a131 100644 --- a/frame/1/bli_l1v_cntx.h +++ b/frame/1/bli_l1v_cntx.h @@ -44,6 +44,7 @@ void 
PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addv ) +GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( copyv ) diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h index c4e206df7..b2b80e016 100644 --- a/frame/1/bli_l1v_ft.h +++ b/frame/1/bli_l1v_ft.h @@ -58,6 +58,21 @@ INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) +// amaxv + +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH2(ch,opname,tsuf)) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx, \ + dim_t* restrict index, \ + cntx_t* cntx \ + ); + +INSERT_GENTDEF( amaxv ) + // axpbyv #undef GENTDEF diff --git a/frame/1/bli_l1v_ker.h b/frame/1/bli_l1v_ker.h index cf80eda46..8039905b7 100644 --- a/frame/1/bli_l1v_ker.h +++ b/frame/1/bli_l1v_ker.h @@ -54,6 +54,20 @@ INSERT_GENTPROT_BASIC( copyv_ker_name ) INSERT_GENTPROT_BASIC( subv_ker_name ) +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx, \ + dim_t* restrict index, \ + cntx_t* cntx \ + ); \ + +INSERT_GENTPROT_BASIC( amaxv_ker_name ) + + #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ diff --git a/frame/1/bli_l1v_oapi.c b/frame/1/bli_l1v_oapi.c index cebc3bfb5..67525d68c 100644 --- a/frame/1/bli_l1v_oapi.c +++ b/frame/1/bli_l1v_oapi.c @@ -82,6 +82,44 @@ GENFRONT( copyv ) GENFRONT( subv ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* index \ + BLIS_OAPI_CNTX_PARAM \ + ) \ +{ \ + BLIS_OAPI_CNTX_DECL \ +\ + num_t dt = bli_obj_datatype( *x ); \ +\ + dim_t n = bli_obj_vector_dim( *x ); \ + void* buf_x = bli_obj_buffer_at_off( *x ); \ + inc_t incx = bli_obj_vector_inc( *x ); \ +\ + void* buf_index = bli_obj_buffer_at_off( *index ); \ +\ + if ( bli_error_checking_is_enabled() ) \ + PASTEMAC(opname,_check)( x, index ); \ +\ + /* Invoke the typed function. 
*/ \ + bli_call_ft_5 \ + ( \ + dt, \ + opname, \ + n, \ + buf_x, incx, \ + buf_index, \ + cntx \ + ); \ +} + +GENFRONT( amaxv ) + + #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/1/bli_l1v_oapi.h b/frame/1/bli_l1v_oapi.h index ff277421c..1c7e534da 100644 --- a/frame/1/bli_l1v_oapi.h +++ b/frame/1/bli_l1v_oapi.h @@ -52,6 +52,19 @@ GENTPROT( copyv ) GENTPROT( subv ) +#undef GENTPROT +#define GENTPROT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* index \ + BLIS_OAPI_CNTX_PARAM \ + ); + +GENTPROT( amaxv ) + + #undef GENTPROT #define GENTPROT( opname ) \ \ diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 4cf6be24e..74a548eea 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -74,6 +74,38 @@ INSERT_GENTFUNC_BASIC( copyv, BLIS_COPYV_KER ) INSERT_GENTFUNC_BASIC( subv, BLIS_SUBV_KER ) +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, kerid ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t n, \ + ctype* x, inc_t incx, \ + dim_t* index, \ + cntx_t* cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ +\ + bli_cntx_init_local_if( opname, cntx, cntx_p ); \ +\ + PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ +\ + f \ + ( \ + n, \ + x, incx, \ + index, \ + cntx_p \ + ); \ +\ + bli_cntx_finalize_local_if( opname, cntx ); \ +} + +INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER ) + + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h index b4b36b059..86cdf416d 100644 --- a/frame/1/bli_l1v_tapi.h +++ b/frame/1/bli_l1v_tapi.h @@ -40,6 +40,9 @@ #undef addv_ker_name #define addv_ker_name addv +#undef amaxv_ker_name +#define amaxv_ker_name amaxv + #undef axpbyv_ker_name #define axpbyv_ker_name axpbyv diff --git a/frame/1/kernels/bli_amaxv_ref.c b/frame/1/kernels/bli_amaxv_ref.c new file mode 100644 index 000000000..f207b799f --- /dev/null +++ b/frame/1/kernels/bli_amaxv_ref.c 
@@ -0,0 +1,134 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define BLAS-like interfaces with typed operands. 
+// + +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t n, \ + ctype* x, inc_t incx, \ + dim_t* i_max, \ + cntx_t* cntx \ + ) \ +{ \ + ctype_r* minus_one = PASTEMAC(chr,m1); \ + dim_t* zero_i = PASTEMAC(i,0); \ +\ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r abs_chi1; \ + ctype_r abs_chi1_max; \ + dim_t i; \ +\ + /* Initialize the index of the maximum absolute value to zero. */ \ + PASTEMAC(i,copys)( *zero_i, *i_max ); \ +\ + /* If the vector length is zero, return early. This directly emulates + the behavior of netlib BLAS's i?amax() routines. */ \ + if ( bli_zero_dim1( n ) ) return; \ +\ + /* Initialize the maximum absolute value search candidate with + -1, which is guaranteed to be less than all values we will + compute. */ \ + PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ +\ + if ( incx == 1 ) \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + /* Get the real and imaginary components of chi1. */ \ + PASTEMAC2(ch,chr,gets)( x[i], chi1_r, chi1_i ); \ +\ + /* Replace chi1_r and chi1_i with their absolute values. */ \ + PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ + PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ +\ + /* Add the real and imaginary absolute values together. */ \ + PASTEMAC(chr,set0s)( abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ +\ + /* If the absolute value of the current element exceeds that of + the previous largest, save it and its index. If NaN is + encountered, then treat it the same as if it were a valid + value that was smaller than any previously seen. This + behavior mimics that of LAPACK's ?lange(). */ \ + if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ + { \ + abs_chi1_max = abs_chi1; \ + *i_max = i; \ + } \ + } \ + } \ + else \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + ctype* chi1 = x + (i )*incx; \ +\ + /* Get the real and imaginary components of chi1. 
*/ \ + PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ +\ + /* Replace chi1_r and chi1_i with their absolute values. */ \ + PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ + PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ +\ + /* Add the real and imaginary absolute values together. */ \ + PASTEMAC(chr,set0s)( abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ +\ + /* If the absolute value of the current element exceeds that of + the previous largest, save it and its index. If NaN is + encountered, then treat it the same as if it were a valid + value that was smaller than any previously seen. This + behavior mimics that of LAPACK's ?lange(). */ \ + if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ + { \ + abs_chi1_max = abs_chi1; \ + *i_max = i; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCR_BASIC0( amaxv_ref ) + diff --git a/frame/1/packv/bli_packv.c b/frame/1/other/packv/bli_packv.c similarity index 100% rename from frame/1/packv/bli_packv.c rename to frame/1/other/packv/bli_packv.c diff --git a/frame/1/packv/bli_packv.h b/frame/1/other/packv/bli_packv.h similarity index 100% rename from frame/1/packv/bli_packv.h rename to frame/1/other/packv/bli_packv.h diff --git a/frame/1/packv/bli_packv_check.c b/frame/1/other/packv/bli_packv_check.c similarity index 100% rename from frame/1/packv/bli_packv_check.c rename to frame/1/other/packv/bli_packv_check.c diff --git a/frame/1/packv/bli_packv_check.h b/frame/1/other/packv/bli_packv_check.h similarity index 100% rename from frame/1/packv/bli_packv_check.h rename to frame/1/other/packv/bli_packv_check.h diff --git a/frame/1/packv/bli_packv_cntl.c b/frame/1/other/packv/bli_packv_cntl.c similarity index 75% rename from frame/1/packv/bli_packv_cntl.c rename to frame/1/other/packv/bli_packv_cntl.c index ac068ce71..13f90a429 100644 --- a/frame/1/packv/bli_packv_cntl.c +++ b/frame/1/other/packv/bli_packv_cntl.c @@ -34,6 +34,7 @@ #include "blis.h" +#if 0 packv_t* packv_cntl = NULL; 
void bli_packv_cntl_init( void ) @@ -77,4 +78,41 @@ void bli_packv_cntl_obj_init( packv_t* cntl, cntl->bmid = bmid; cntl->pack_schema = pack_schema; } +#endif + +cntl_t* bli_packv_cntl_obj_create + ( + void* var_func, + void* packv_var_func, + bszid_t bmid, + pack_t pack_schema, + cntl_t* sub_node + ) +{ + cntl_t* cntl; + packv_params_t* params; + + // Allocate a packv_params_t struct. + params = bli_malloc_intl( sizeof( packv_params_t ) ); + + // Initialize the packv_params_t struct. + params->size = sizeof( packv_params_t ); + params->var_func = packv_var_func; + params->bmid = bmid; + params->pack_schema = pack_schema; + + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. + cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + params, + sub_node + ); + + return cntl; +} diff --git a/frame/1/other/packv/bli_packv_cntl.h b/frame/1/other/packv/bli_packv_cntl.h new file mode 100644 index 000000000..1fc265338 --- /dev/null +++ b/frame/1/other/packv/bli_packv_cntl.h @@ -0,0 +1,67 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +struct packv_params_s +{ + uint64_t size; + packv_voft* var_func; + bszid_t bmid; + pack_t pack_schema; +}; +typedef struct packv_params_s packv_params_t; + + +#define bli_cntl_packv_params_var_func( cntl ) \ +\ + ( ( (packv_params_t*)(cntl)->params )->var_func ) + +#define bli_cntl_packv_params_bmid( cntl ) \ +\ + ( ( (packv_params_t*)(cntl)->params )->bmid ) + +#define bli_cntl_packv_params_pack_schema( cntl ) \ +\ + ( ( (packv_params_t*)(cntl)->params )->pack_schema ) + +// ----------------------------------------------------------------------------- + +cntl_t* bli_packv_cntl_obj_create + ( + void* var_func, + void* packv_var_func, + bszid_t bmid, + pack_t pack_schema, + cntl_t* sub_node + ); + diff --git a/frame/1/packv/bli_packv_init.c b/frame/1/other/packv/bli_packv_init.c similarity index 70% rename from frame/1/packv/bli_packv_init.c rename to frame/1/other/packv/bli_packv_init.c index c43931272..01b8f3cdd 100644 --- a/frame/1/packv/bli_packv_init.c +++ 
b/frame/1/other/packv/bli_packv_init.c @@ -52,7 +52,6 @@ void bli_packv_init pack_t pack_schema; bszid_t bmult_id; - obj_t c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -84,26 +83,6 @@ void bli_packv_init // left is whether we are to typecast vector a before packing. if ( bli_obj_datatype( *a ) != bli_obj_target_datatype( *a ) ) bli_abort(); -/* - { - // Initialize an object c for the intermediate typecast vector. - bli_packv_init_cast( a, - p, - &c ); - - // Copy/typecast vector a to vector c. - bli_copyv( a, - &c ); - } - else -*/ - { - // If no cast is needed, then aliasing object c to the original - // vector serves as a minor optimization. This causes the packv - // implementation to pack directly from vector a. - bli_obj_alias_to( *a, c ); - } - // Extract various fields from the control tree and pass them in // explicitly into _init_pack(). This allows external code generators @@ -116,7 +95,7 @@ void bli_packv_init ( pack_schema, bmult_id, - &c, + a, p, cntx ); @@ -125,22 +104,24 @@ void bli_packv_init } -void bli_packv_init_pack +siz_t bli_packv_init_pack ( - pack_t pack_schema, + pack_t schema, bszid_t bmult_id, - obj_t* c, + obj_t* a, obj_t* p, cntx_t* cntx ) { - num_t dt = bli_obj_datatype( *c ); - dim_t dim_c = bli_obj_vector_dim( *c ); + num_t dt = bli_obj_datatype( *a ); + dim_t dim_a = bli_obj_vector_dim( *a ); dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); membrk_t* membrk = bli_cntx_membrk( cntx ); +#if 0 mem_t* mem_p; +#endif dim_t m_p_pad; siz_t size_p; inc_t rs_p, cs_p; @@ -148,21 +129,17 @@ void bli_packv_init_pack // We begin by copying the basic fields of c. - bli_obj_alias_to( *c, *p ); + bli_obj_alias_to( *a, *p ); // Update the dimensions. - bli_obj_set_dims( dim_c, 1, *p ); + bli_obj_set_dims( dim_a, 1, *p ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, *p ); // Set the pack schema in the p object to the value in the control tree // node. 
- bli_obj_set_pack_schema( pack_schema, *p ); - - // Extract the address of the mem_t object within p that will track - // properties of the packed buffer. - mem_p = bli_obj_pack_mem( *p ); + bli_obj_set_pack_schema( schema, *p ); // Compute the dimensions padded by the dimension multiples. m_p_pad = bli_align_dim_to_mult( bli_obj_vector_dim( *p ), bmult ); @@ -170,6 +147,11 @@ void bli_packv_init_pack // Compute the size of the packed buffer. size_p = m_p_pad * 1 * bli_obj_elem_size( *p ); +#if 0 + // Extract the address of the mem_t object within p that will track + // properties of the packed buffer. + mem_p = bli_obj_pack_mem( *p ); + if ( bli_mem_is_unalloc( mem_p ) ) { // If the mem_t object of p has not yet been allocated, then acquire @@ -192,19 +174,19 @@ void bli_packv_init_pack } } - // Save the padded (packed) dimensions into the packed object. - bli_obj_set_padded_dims( m_p_pad, 1, *p ); - // Grab the buffer address from the mem_t object and copy it to the // main object buffer field. (Sometimes this buffer address will be // copied when the value is already up-to-date, because it persists // in the main object buffer field across loop iterations.) buf = bli_mem_buffer( mem_p ); bli_obj_set_buffer( buf, *p ); +#endif + // Save the padded (packed) dimensions into the packed object. + bli_obj_set_padded_dims( m_p_pad, 1, *p ); // Set the row and column strides of p based on the pack schema. - if ( pack_schema == BLIS_PACKED_VECTOR ) + if ( schema == BLIS_PACKED_VECTOR ) { // Set the strides to reflect a column-stored vector. 
Note that the // column stride may never be used, and is only useful to determine @@ -215,8 +197,11 @@ void bli_packv_init_pack bli_obj_set_strides( rs_p, cs_p, *p ); } + + return size_p; } +#if 0 void bli_packv_release ( obj_t* p, @@ -226,52 +211,4 @@ void bli_packv_release if ( !bli_cntl_is_noop( cntl ) ) bli_obj_release_pack( p ); } - - -/* -void bli_packv_init_cast( obj_t* a, - obj_t* p, - obj_t* c ) -{ - // The idea here is that we want to create an object c that is identical - // to object a, except that: - // (1) the storage datatype of c is equal to the target datatype of a, - // with the element size of c adjusted accordingly, - // (2) object c is marked as being stored in a standard, contiguous - // format (ie: a column vector), - // (3) the view offset of c is reset to (0,0), and - // (4) object c's main buffer is set to a new memory region acquired - // from the memory manager, or extracted from p if a mem entry is - // already available. (After acquring a mem entry from the memory - // manager, it is cached within p for quick access later on.) - - num_t dt_targ_a = bli_obj_target_datatype( *a ); - dim_t dim_a = bli_obj_vector_dim( *a ); - siz_t elem_size_c = bli_datatype_size( dt_targ_a ); - - // We begin by copying the basic fields of a. - bli_obj_alias_to( *a, *c ); - - // Update datatype and element size fields. - bli_obj_set_datatype( dt_targ_a, *c ); - bli_obj_set_elem_size( elem_size_c, *c ); - - // Update the dimensions. - bli_obj_set_dims( dim_a, 1, *c ); - - // Reset the view offsets to (0,0). - bli_obj_set_offs( 0, 0, *c ); - - // Check the mem_t entry of p associated with the cast buffer. If it is - // NULL, then acquire memory sufficient to hold the object data and cache - // it to p. (Otherwise, if it is non-NULL, then memory has already been - // acquired from the memory manager and cached.) We then set the main - // buffer of c to the cached address of the cast memory. 
- bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); - - // Update the strides. We set the increments to reflect a column storage. - // Note that the column stride should never be used. - bli_obj_set_strides( 1, dim_a, *c ); -} -*/ - +#endif diff --git a/frame/1/packv/bli_packv_init.h b/frame/1/other/packv/bli_packv_init.h similarity index 88% rename from frame/1/packv/bli_packv_init.h rename to frame/1/other/packv/bli_packv_init.h index 03d12903c..6104bbdc7 100644 --- a/frame/1/packv/bli_packv_init.h +++ b/frame/1/other/packv/bli_packv_init.h @@ -40,23 +40,12 @@ void bli_packv_init packv_t* cntl ); -void bli_packv_init_pack +siz_t bli_packv_init_pack ( pack_t pack_schema, bszid_t bmult_id, - obj_t* c, + obj_t* a, obj_t* p, cntx_t* cntx ); -void bli_packv_release - ( - obj_t* p, - packv_t* cntl - ); - -/* -void bli_packv_init_cast( obj_t* a, - obj_t* p, - obj_t* c ); -*/ diff --git a/frame/1/packv/bli_packv_int.c b/frame/1/other/packv/bli_packv_int.c similarity index 85% rename from frame/1/packv/bli_packv_int.c rename to frame/1/other/packv/bli_packv_int.c index d22f0113e..75cbd193c 100644 --- a/frame/1/packv/bli_packv_int.c +++ b/frame/1/other/packv/bli_packv_int.c @@ -47,27 +47,23 @@ static FUNCPTR_T vars[1][3] = { bli_packv_unb_var1, NULL, NULL } }; -void bli_packv_int( obj_t* a, - obj_t* p, - cntx_t* cntx, - packv_t* cntl ) +void bli_packv_int + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl + ) { - // The packv operation consists of an optional typecasting pre-process. - // Here are the following possible ways packv can execute: - // 1. cast and pack: When typecasting and packing are both - // precribed, typecast a to temporary vector c and then pack - // c to p. - // 2. pack only: Typecasting is skipped when it is not needed; - // simply pack a directly to p. - // 3. cast only: Not yet supported / not used. - // 4. no-op: The control tree sometimes directs us to skip the - // pack operation entirely. Alias p to a and return. 
- - //obj_t c; - +#if 0 varnum_t n; impl_t i; - FUNCPTR_T f; +#endif + packv_voft f; + +// !!! +// DEFINE packv_voft type. +// !!! // Check parameters. if ( bli_error_checking_is_enabled() ) diff --git a/frame/1/packv/bli_packv_int.h b/frame/1/other/packv/bli_packv_int.h similarity index 100% rename from frame/1/packv/bli_packv_int.h rename to frame/1/other/packv/bli_packv_int.h diff --git a/frame/1/packv/bli_packv_unb_var1.c b/frame/1/other/packv/bli_packv_unb_var1.c similarity index 100% rename from frame/1/packv/bli_packv_unb_var1.c rename to frame/1/other/packv/bli_packv_unb_var1.c diff --git a/frame/1/packv/bli_packv_unb_var1.h b/frame/1/other/packv/bli_packv_unb_var1.h similarity index 100% rename from frame/1/packv/bli_packv_unb_var1.h rename to frame/1/other/packv/bli_packv_unb_var1.h diff --git a/frame/1/scalv/bli_scalv_cntl.c b/frame/1/other/scalv/bli_scalv_cntl.c similarity index 100% rename from frame/1/scalv/bli_scalv_cntl.c rename to frame/1/other/scalv/bli_scalv_cntl.c diff --git a/frame/1/scalv/bli_scalv_cntl.h b/frame/1/other/scalv/bli_scalv_cntl.h similarity index 100% rename from frame/1/scalv/bli_scalv_cntl.h rename to frame/1/other/scalv/bli_scalv_cntl.h diff --git a/frame/1/scalv/bli_scalv_int.c b/frame/1/other/scalv/bli_scalv_int.c similarity index 100% rename from frame/1/scalv/bli_scalv_int.c rename to frame/1/other/scalv/bli_scalv_int.c diff --git a/frame/1/scalv/bli_scalv_int.h b/frame/1/other/scalv/bli_scalv_int.h similarity index 100% rename from frame/1/scalv/bli_scalv_int.h rename to frame/1/other/scalv/bli_scalv_int.h diff --git a/frame/1/unpackv/bli_unpackv.c b/frame/1/other/unpackv/bli_unpackv.c similarity index 100% rename from frame/1/unpackv/bli_unpackv.c rename to frame/1/other/unpackv/bli_unpackv.c diff --git a/frame/1/unpackv/bli_unpackv.h b/frame/1/other/unpackv/bli_unpackv.h similarity index 100% rename from frame/1/unpackv/bli_unpackv.h rename to frame/1/other/unpackv/bli_unpackv.h diff --git 
a/frame/1/unpackv/bli_unpackv_check.c b/frame/1/other/unpackv/bli_unpackv_check.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_check.c rename to frame/1/other/unpackv/bli_unpackv_check.c diff --git a/frame/1/unpackv/bli_unpackv_check.h b/frame/1/other/unpackv/bli_unpackv_check.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_check.h rename to frame/1/other/unpackv/bli_unpackv_check.h diff --git a/frame/1/unpackv/bli_unpackv_cntl.c b/frame/1/other/unpackv/bli_unpackv_cntl.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_cntl.c rename to frame/1/other/unpackv/bli_unpackv_cntl.c diff --git a/frame/1/unpackv/bli_unpackv_cntl.h b/frame/1/other/unpackv/bli_unpackv_cntl.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_cntl.h rename to frame/1/other/unpackv/bli_unpackv_cntl.h diff --git a/frame/1/unpackv/bli_unpackv_int.c b/frame/1/other/unpackv/bli_unpackv_int.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_int.c rename to frame/1/other/unpackv/bli_unpackv_int.c diff --git a/frame/1/unpackv/bli_unpackv_int.h b/frame/1/other/unpackv/bli_unpackv_int.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_int.h rename to frame/1/other/unpackv/bli_unpackv_int.h diff --git a/frame/1/unpackv/bli_unpackv_unb_var1.c b/frame/1/other/unpackv/bli_unpackv_unb_var1.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_unb_var1.c rename to frame/1/other/unpackv/bli_unpackv_unb_var1.c diff --git a/frame/1/unpackv/bli_unpackv_unb_var1.h b/frame/1/other/unpackv/bli_unpackv_unb_var1.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_unb_var1.h rename to frame/1/other/unpackv/bli_unpackv_unb_var1.h diff --git a/frame/1m/bli_l1m.h b/frame/1m/bli_l1m.h index ff9c98459..5c55b97d3 100644 --- a/frame/1m/bli_l1m.h +++ b/frame/1m/bli_l1m.h @@ -36,6 +36,7 @@ #include "bli_l1m_check.h" #include "bli_l1m_ft.h" +#include "bli_l1m_voft.h" // Prototype object APIs with and without contexts. 
#include "bli_oapi_w_cntx.h" @@ -51,6 +52,5 @@ #include "bli_unpackm.h" // Other -#include "bli_scalm_cntl.h" -#include "bli_scalm_int.h" +#include "bli_scalm.h" diff --git a/frame/1m/bli_l1m_voft.h b/frame/1m/bli_l1m_voft.h new file mode 100644 index 000000000..f5fdf5b65 --- /dev/null +++ b/frame/1m/bli_l1m_voft.h @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_L1M_VAR_OFT_H +#define BLIS_L1M_VAR_OFT_H + + +// +// -- Level-1m variant function types ------------------------------------------ +// + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* p, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( packm ) + + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* p, \ + obj_t* a, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( unpackm ) + + + +#endif + diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index cc8e84b2d..4ce7b1504 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -93,10 +93,14 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = }; -void bli_packm_blk_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* t ) +void bli_packm_blk_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* t + ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -140,7 +144,7 @@ void bli_packm_blk_var1( obj_t* c, // whether we are executing an induced method. if ( bli_is_nat_packed( schema ) ) { - // This branch if for native execution, where we assume that + // This branch is for native execution, where we assume that // the micro-kernel will always apply the alpha scalar of the // higher-level operation. Thus, we use BLIS_ONE for kappa so // that the underlying packm implementation does not perform @@ -156,28 +160,25 @@ void bli_packm_blk_var1( obj_t* c, // real domain micro-kernels. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) 
- if( bli_thread_am_ochief( t ) ) + if ( bli_obj_scalar_has_nonzero_imag( p ) ) { - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { -//printf( "applying non-zero imag kappa\n" ); - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } + //printf( "applying non-zero imag kappa\n" ); + + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); + + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); + + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; } - kappa_p = bli_thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); @@ -194,7 +195,12 @@ void bli_packm_blk_var1( obj_t* c, bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; else packm_kers = packm_struc_cxk_kers; #else - func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); + // The original idea here was to read the packm_ukr from the context + // if it is non-NULL. The problem is, it requires that we be able to + // assume that the packm_ukr field is initialized to NULL, which it + // currently is not. + + //func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); //if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) ) { @@ -203,7 +209,6 @@ void bli_packm_blk_var1( obj_t* c, // we use the default lookup table to determine the right func_t // for the current schema. 
const dim_t i = bli_pack_schema_index( schema ); -//printf( "bli_packm_blk_var1: pack schema index = %lu (schema = %x)\n", i, schema ); packm_kers = &packm_struc_cxk_kers[ i ]; } @@ -221,11 +226,6 @@ void bli_packm_blk_var1( obj_t* c, // Query the datatype-specific function pointer from the func_t object. packm_ker = bli_func_get_dt( dt_cp, packm_kers ); - -//bli_cntx_print( cntx ); -//printf( "bli_packm_blk_var1: packm_ker = %p\n", packm_ker ); -//printf( "bli_packm_blk_var1: cntx_packm_ker = %p\n", cntx_packm_kers ); -//printf( "bli_packm_blk_var1: local_table_entry = %p\n", &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ] ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_cp]; @@ -598,6 +598,57 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ p_inc = ps_p; \ } \ \ +/* +if ( col_stored ) { \ + if ( bli_thread_work_id( thread ) == 0 ) \ + { \ + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ + if ( bli_thread_work_id( thread ) == 1 ) \ + { \ + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ +} \ +else { \ + if ( bli_thread_work_id( thread ) == 0 ) \ + { \ + printf( "packm_blk_var1: thread %lu (b = %p, 
bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ + if ( bli_thread_work_id( thread ) == 1 ) \ + { \ + printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ +} \ +*/ \ +\ /* if ( bli_is_4mi_packed( schema ) ) { \ printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \ diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 8971da5c0..4e04f86f9 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -32,10 +32,14 @@ */ -void bli_packm_blk_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* t ); +void bli_packm_blk_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* t + ); #undef GENTPROT diff --git a/frame/1m/packm/bli_packm_check.c b/frame/1m/packm/bli_packm_check.c index 6a56b8676..f8c66eee5 100644 --- a/frame/1m/packm/bli_packm_check.c +++ b/frame/1m/packm/bli_packm_check.c @@ -35,9 +35,12 @@ #include "blis.h" -void bli_packm_init_check( obj_t* a, - obj_t* p, - cntx_t* cntx ) +void bli_packm_init_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ) { err_t e_val; @@ -54,9 +57,12 @@ void bli_packm_init_check( obj_t* a, //bli_check_error_code( e_val ); } -void bli_packm_int_check( obj_t* a, - obj_t* p, - cntx_t* cntx ) 
+void bli_packm_int_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ) { err_t e_val; diff --git a/frame/1m/packm/bli_packm_check.h b/frame/1m/packm/bli_packm_check.h index 9974ced6b..9b2e8a66e 100644 --- a/frame/1m/packm/bli_packm_check.h +++ b/frame/1m/packm/bli_packm_check.h @@ -32,10 +32,17 @@ */ -void bli_packm_init_check( obj_t* a, - obj_t* p, - cntx_t* cntx ); +void bli_packm_init_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ); + +void bli_packm_int_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ); -void bli_packm_int_check( obj_t* a, - obj_t* p, - cntx_t* cntx ); diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index f0f674615..67b01fffb 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -34,109 +34,49 @@ #include "blis.h" -packm_t* packm_cntl_row = NULL; -packm_t* packm_cntl_col = NULL; - -packm_t* packm_cntl = NULL; - -void bli_packm_cntl_init() +cntl_t* bli_packm_cntl_obj_create + ( + void* var_func, + void* packm_var_func, + bszid_t bmid_m, + bszid_t bmid_n, + bool_t does_invert_diag, + bool_t rev_iter_if_upper, + bool_t rev_iter_if_lower, + pack_t pack_schema, + packbuf_t pack_buf_type, + cntl_t* sub_node + ) { - // Generally speaking, the BLIS_PACKED_ROWS and BLIS_PACKED_COLUMNS - // are used by the level-2 operations. These schemas amount to simple - // copies to row or column storage. These simple schemas may be used - // by level-3 operations, but they should never be used for matrices - // with structure (since they do not densify). - // The BLIS_PACKED_ROW_PANELS and BLIS_PACKED_COL_PANELS schemas are - // used only in level-3 operations. They pack to (typically) skinny - // row and column panels, where the width of the panel is determined - // by register blocksizes. It is assumed that matrices with structure - // will be densified. + cntl_t* cntl; + packm_params_t* params; - // Create control trees to pack by rows. 
- packm_cntl_row - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to rows: - BLIS_VF, // used for m dimension - BLIS_VF, // used for n dimension - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_ROWS, - BLIS_BUFFER_FOR_GEN_USE ); + // Allocate a packm_params_t struct. + params = bli_malloc_intl( sizeof( packm_params_t ) ); + // Initialize the packm_params_t struct. + params->size = sizeof( packm_params_t ); + params->var_func = packm_var_func; + params->bmid_m = bmid_m; + params->bmid_n = bmid_n; + params->does_invert_diag = does_invert_diag; + params->rev_iter_if_upper = rev_iter_if_upper; + params->rev_iter_if_lower = rev_iter_if_lower; + params->pack_schema = pack_schema; + params->pack_buf_type = pack_buf_type; - // Create control trees to pack by columns. - packm_cntl_col - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to columns: - BLIS_VF, // used for m dimension - BLIS_VF, // used for n dimension - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_COLUMNS, - BLIS_BUFFER_FOR_GEN_USE ); - - - // Set defaults when we don't care whether the packing is by rows or - // by columns. 
- packm_cntl = packm_cntl_col; -} - -void bli_packm_cntl_finalize() -{ - bli_cntl_obj_free( packm_cntl_row ); - bli_cntl_obj_free( packm_cntl_col ); -} - -packm_t* bli_packm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ) -{ - packm_t* cntl; - - cntl = ( packm_t* ) bli_malloc_intl( sizeof(packm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bmid_m = bmid_m; - cntl->bmid_n = bmid_n; - cntl->does_invert_diag = does_invert_diag; - cntl->rev_iter_if_upper = rev_iter_if_upper; - cntl->rev_iter_if_lower = rev_iter_if_lower; - cntl->pack_schema = pack_schema; - cntl->pack_buf_type = pack_buf_type; + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. 
+ cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + params, + sub_node + ); return cntl; } -void bli_packm_cntl_obj_init( packm_t* cntl, - impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ) -{ - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bmid_m = bmid_m; - cntl->bmid_n = bmid_n; - cntl->does_invert_diag = does_invert_diag; - cntl->rev_iter_if_upper = rev_iter_if_upper; - cntl->rev_iter_if_lower = rev_iter_if_lower; - cntl->pack_schema = pack_schema; - cntl->pack_buf_type = pack_buf_type; -} - diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 1dc31c543..057a512ed 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -32,56 +32,65 @@ */ -struct packm_s +struct packm_params_s { - impl_t impl_type; - varnum_t var_num; - bszid_t bmid_m; - bszid_t bmid_n; - bool_t does_invert_diag; - bool_t rev_iter_if_upper; - bool_t rev_iter_if_lower; - pack_t pack_schema; - packbuf_t pack_buf_type; + uint64_t size; // size field must be present and come first. 
+ packm_voft var_func; + bszid_t bmid_m; + bszid_t bmid_n; + bool_t does_invert_diag; + bool_t rev_iter_if_upper; + bool_t rev_iter_if_lower; + pack_t pack_schema; + packbuf_t pack_buf_type; }; -typedef struct packm_s packm_t; +typedef struct packm_params_s packm_params_t; -#define cntl_bmid_m( cntl ) cntl->bmid_m -#define cntl_bmid_n( cntl ) cntl->bmid_n +#define bli_cntl_packm_params_var_func( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->var_func ) -#define cntl_does_invert_diag( cntl ) cntl->does_invert_diag -#define cntl_rev_iter_if_upper( cntl ) cntl->rev_iter_if_upper -#define cntl_rev_iter_if_lower( cntl ) cntl->rev_iter_if_lower -#define cntl_pack_schema( cntl ) cntl->pack_schema -#define cntl_pack_buf_type( cntl ) cntl->pack_buf_type +#define bli_cntl_packm_params_bmid_m( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->bmid_m ) -#define bli_cntl_sub_packm( cntl ) cntl->sub_packm -#define bli_cntl_sub_packm_a( cntl ) cntl->sub_packm_a -#define bli_cntl_sub_packm_a11( cntl ) cntl->sub_packm_a11 -#define bli_cntl_sub_packm_b( cntl ) cntl->sub_packm_b -#define bli_cntl_sub_packm_b11( cntl ) cntl->sub_packm_b11 -#define bli_cntl_sub_packm_c( cntl ) cntl->sub_packm_c -#define bli_cntl_sub_packm_c11( cntl ) cntl->sub_packm_c11 +#define bli_cntl_packm_params_bmid_n( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->bmid_n ) -void bli_packm_cntl_init( void ); -void bli_packm_cntl_finalize( void ); -packm_t* bli_packm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ); -void bli_packm_cntl_obj_init( packm_t* cntl, - impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ); +#define bli_cntl_packm_params_does_invert_diag( cntl ) \ +\ 
+ ( ( (packm_params_t*)(cntl)->params )->does_invert_diag ) + +#define bli_cntl_packm_params_rev_iter_if_upper( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->rev_iter_if_upper ) + +#define bli_cntl_packm_params_rev_iter_if_lower( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->rev_iter_if_lower ) + +#define bli_cntl_packm_params_pack_schema( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->pack_schema ) + +#define bli_cntl_packm_params_pack_buf_type( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->pack_buf_type ) + +// ----------------------------------------------------------------------------- + +cntl_t* bli_packm_cntl_obj_create + ( + void* var_func, + void* packm_var_func, + bszid_t bmid_m, + bszid_t bmid_n, + bool_t does_invert_diag, + bool_t rev_iter_if_upper, + bool_t rev_iter_if_lower, + pack_t pack_schema, + packbuf_t pack_buf_type, + cntl_t* sub_node + ); diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index d42abfd62..4f570400a 100644 --- a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -52,7 +52,7 @@ void bli_packm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx ); // Initialize the context with the global membrk object. - bli_cntx_set_membrk( bli_mem_global_membrk(), cntx ); + bli_cntx_set_membrk( bli_memsys_global_membrk(), cntx ); } void bli_packm_cntx_finalize( cntx_t* cntx ) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index c33a0410e..ccf88f3cb 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -35,38 +35,43 @@ #include "blis.h" -void bli_packm_init( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl ) +siz_t bli_packm_init + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl + ) { // The purpose of packm_init() is to initialize an object P so that // a source object A can be packed into P via one of the packm - // implementations. 
This initialization includes acquiring a suitable - // block of memory from the memory allocator, if such a block of memory - // has not already been allocated previously. + // implementations. This initialization precedes the acquisition of a + // suitable block of memory from the memory allocator (if such a block + // of memory has not already been allocated previously). - invdiag_t invert_diag; - pack_t schema; - packord_t pack_ord_if_up; - packord_t pack_ord_if_lo; - packbuf_t pack_buf_type; bszid_t bmult_id_m; bszid_t bmult_id_n; - obj_t c; + bool_t does_invert_diag; + bool_t rev_iter_if_upper; + bool_t rev_iter_if_lower; + //pack_t pack_schema; + packbuf_t pack_buf_type; + siz_t size_needed; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_packm_init_check( a, p, cntx ); - // First check if we are to skip this operation because the control tree - // is NULL, and if so, simply alias the object to its packed counterpart. - if ( bli_cntl_is_noop( cntl ) ) - { - bli_obj_alias_to( *a, *p ); - return; - } + // Extract various fields from the control tree. + bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); + rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); + rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); + //pack_schema = bli_cntl_packm_params_pack_schema( cntl ); + pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); +#if 0 // Let us now check to see if the object has already been packed. First // we check if it has been packed to an unspecified (row or column) // format, in which case we can alias the object and return. @@ -79,179 +84,150 @@ void bli_packm_init( obj_t* a, if ( bli_obj_pack_schema( *a ) == BLIS_PACKED_UNSPEC ) { bli_obj_alias_to( *a, *p ); - return; + return 0; } - // At this point, we can be assured that cntl is not NULL. 
Now we check - // if the object has already been packed to the desired schema (as en- - // coded in the control tree). If so, we can alias and return, as above. + // Now we check if the object has already been packed to the desired + // schema (as encoded in the control tree). If so, we can alias and + // return 0. // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED // and thus packing will be called for (but in some cases packing has // already taken place, or does not need to take place, and so that will // be indicated by the pack status). Also, not all combinations of // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( *a ) == cntl_pack_schema( cntl ) ) + if ( bli_obj_pack_schema( *a ) == pack_schema ) { bli_obj_alias_to( *a, *p ); - return; + return 0; } +#endif // If the object is marked as being filled with zeros, then we can skip - // the packm operation entirely and alias. Notice that we use pack-aware - // aliasing. This is needed because the object may have been packed in - // a previous iteration, which means the object currently contains the - // mem_t entry of an already-allocated block. bli_obj_alias_for_packing() - // will avoid overwriting that mem_t entry, which means it can be - // properly released later on. + // the packm operation entirely and alias. if ( bli_obj_is_zeros( *a ) ) { - bli_obj_alias_for_packing( *a, *p ); - return; + bli_obj_alias_to( *a, *p ); + return 0; } - // Now, if we are not skipping the pack operation, then the only question - // left is whether we are to typecast matrix a before packing. - if ( bli_obj_datatype( *a ) != bli_obj_target_datatype( *a ) ) - bli_abort(); -/* - { - // Initialize an object c for the intermediate typecast matrix. - bli_packm_init_cast( a, - p, - &c ); - - // Copy/typecast matrix a to matrix c. 
- bli_copym( a, - &c ); - } - else -*/ - { - // If no cast is needed, then aliasing object c to the original - // matrix serves as a minor optimization. This causes the packm - // implementation to pack directly from matrix a. - bli_obj_alias_to( *a, c ); - } - - - // Extract various fields from the control tree. - pack_buf_type = cntl_pack_buf_type( cntl ); - bmult_id_m = cntl_bmid_m( cntl ); - bmult_id_n = cntl_bmid_n( cntl ); - - // Extract the schema from the context, depending on whether we are + // We now ignore the pack_schema field in the control tree and + // extract the schema from the context, depending on whether we are // preparing to pack a block of A or panel of B. For A and B, we must // obtain the schema from the context since the induced methods reuse // the same control trees used by native execution, and those induced // methods specify the schema used by the current execution phase // within the context (whereas the control tree does not change). + pack_t schema; + if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) { schema = bli_cntx_get_pack_schema_a( cntx ); -//printf( "bli_packm_init: pack schema a = %x\n", schema ); } else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) { schema = bli_cntx_get_pack_schema_b( cntx ); -//printf( "bli_packm_init: pack schema b = %x\n", schema ); } else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) { // If we get a request to pack C for some reason, it is likely // not part of an induced method, and so it would be safe (and // necessary) to read the pack schema from the control tree. - schema = cntl_pack_schema( cntl ); -//printf( "bli_packm_init: pack schema c = %x\n", schema ); + schema = bli_cntl_packm_params_pack_schema( cntl ); } // Prepare a few other variables based on properties of the control // tree. 
- if ( cntl_does_invert_diag( cntl ) ) invert_diag = BLIS_INVERT_DIAG; - else invert_diag = BLIS_NO_INVERT_DIAG; + invdiag_t invert_diag; + packord_t pack_ord_if_up; + packord_t pack_ord_if_lo; - if ( cntl_rev_iter_if_upper( cntl ) ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER; - else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER; + if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG; + else invert_diag = BLIS_NO_INVERT_DIAG; - if ( cntl_rev_iter_if_lower( cntl ) ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; - else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; + if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER; + else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER; + + if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; + else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; // Initialize object p for the final packed matrix. - bli_packm_init_pack( invert_diag, - schema, - pack_ord_if_up, - pack_ord_if_lo, - pack_buf_type, - bmult_id_m, - bmult_id_n, - &c, - p, - cntx ); + size_needed + = + bli_packm_init_pack + ( + invert_diag, + schema, + pack_ord_if_up, + pack_ord_if_lo, + bmult_id_m, + bmult_id_n, + a, + p, + cntx + ); - // Now p is ready to be packed. + // Return the size needed for memory allocation of the packed buffer. 
+ return size_needed; } -void bli_packm_init_pack( invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - packbuf_t pack_buf_type, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* c, - obj_t* p, - cntx_t* cntx ) +siz_t bli_packm_init_pack + ( + invdiag_t invert_diag, + pack_t schema, + packord_t pack_ord_if_up, + packord_t pack_ord_if_lo, + bszid_t bmult_id_m, + bszid_t bmult_id_n, + obj_t* a, + obj_t* p, + cntx_t* cntx + ) { - num_t dt = bli_obj_datatype( *c ); - trans_t transc = bli_obj_onlytrans_status( *c ); - dim_t m_c = bli_obj_length( *c ); - dim_t n_c = bli_obj_width( *c ); + num_t dt = bli_obj_datatype( *a ); + trans_t transa = bli_obj_onlytrans_status( *a ); + dim_t m_a = bli_obj_length( *a ); + dim_t n_a = bli_obj_width( *a ); dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ); dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx ); - membrk_t* membrk = bli_cntx_get_membrk( cntx ); - - mem_t* mem_p; dim_t m_p, n_p; dim_t m_p_pad, n_p_pad; siz_t size_p; siz_t elem_size_p; inc_t rs_p, cs_p; inc_t is_p; - void* buf; - // We begin by copying the basic fields of c. We do NOT copy the - // pack_mem entry from c because the entry in p may be cached from - // a previous iteration, and thus we don't want to overwrite it. - bli_obj_alias_for_packing( *c, *p ); + // We begin by copying the fields of A. + bli_obj_alias_to( *a, *p ); // Update the dimension fields to explicitly reflect a transposition, // if needed. // Then, clear the conjugation and transposition fields from the object // since matrix packing in BLIS is deemed to take care of all conjugation // and transposition necessary. - // Then, we adjust the properties of p when c needs a transposition. 
- // We negate the diagonal offset, and if c is upper- or lower-stored, - // we either toggle the uplo of p. - // Finally, if we mark p as dense since we assume that all matrices, + // Then, we adjust the properties of P when A needs a transposition. + // We negate the diagonal offset, and if A is upper- or lower-stored, + // we also toggle the uplo of P. + // Finally, we mark P as dense since we assume that all matrices, // regardless of structure, will be densified. - bli_obj_set_dims_with_trans( transc, m_c, n_c, *p ); + bli_obj_set_dims_with_trans( transa, m_a, n_a, *p ); bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, *p ); - if ( bli_does_trans( transc ) ) + if ( bli_does_trans( transa ) ) { bli_obj_negate_diag_offset( *p ); - if ( bli_obj_is_upper_or_lower( *c ) ) + if ( bli_obj_is_upper_or_lower( *a ) ) bli_obj_toggle_uplo( *p ); } - // If we are packing micro-panels, mark p as dense. Otherwise, we are + // If we are packing micro-panels, mark P as dense. Otherwise, we are // probably being called in the context of a level-2 operation, in - // which case we do not want to overwrite the uplo field of p (inherited - // from c) with BLIS_DENSE because that information may be needed by + // which case we do not want to overwrite the uplo field of P (inherited + // from A) with BLIS_DENSE because that information may be needed by // the level-2 operation's unblocked variant to decide whether to // execute a "lower" or "upper" branch of code. if ( bli_is_panel_packed( schema ) ) @@ -265,7 +241,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, // Set the invert diagonal field. bli_obj_set_invert_diag( invert_diag, *p ); - // Set the pack status of p to the pack schema prescribed in the control + // Set the pack status of P to the pack schema prescribed in the control // tree node. 
bli_obj_set_pack_schema( schema, *p ); @@ -273,15 +249,11 @@ void bli_packm_init_pack( invdiag_t invert_diag, bli_obj_set_pack_order_if_upper( pack_ord_if_up, *p ); bli_obj_set_pack_order_if_lower( pack_ord_if_lo, *p ); - // Extract the address of the mem_t object within p that will track - // properties of the packed buffer. - mem_p = bli_obj_pack_mem( *p ); - // Compute the dimensions padded by the dimension multiples. These // dimensions will be the dimensions of the packed matrices, including // zero-padding, and will be used by the macro- and micro-kernels. - // We compute them by starting with the effective dimensions of c (now - // in p) and aligning them to the dimension multiples (typically equal + // We compute them by starting with the effective dimensions of A (now + // in P) and aligning them to the dimension multiples (typically equal // to register blocksizes). This does waste a little bit of space for // level-2 operations, but that's okay with us. m_p = bli_obj_length( *p ); @@ -295,9 +267,9 @@ void bli_packm_init_pack( invdiag_t invert_diag, bli_obj_set_padded_dims( m_p_pad, n_p_pad, *p ); // Now we prepare to compute strides, align them, and compute the - // total number of bytes needed for the packed buffer. After that, - // we will acquire an appropriate block of memory from the memory - // allocator. + // total number of bytes needed for the packed buffer. The caller + // will then use that value to acquire an appropriate block of memory + // from the memory allocator. // Extract the element size for the packed object. elem_size_p = bli_obj_elem_size( *p ); @@ -320,7 +292,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, rs_p = bli_align_dim_to_size( rs_p, elem_size_p, BLIS_HEAP_STRIDE_ALIGN_SIZE ); - // Store the strides in p. + // Store the strides in P. bli_obj_set_strides( rs_p, cs_p, *p ); // Compute the size of the packed buffer. 
@@ -343,7 +315,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, cs_p = bli_align_dim_to_size( cs_p, elem_size_p, BLIS_HEAP_STRIDE_ALIGN_SIZE ); - // Store the strides in p. + // Store the strides in P. bli_obj_set_strides( rs_p, cs_p, *p ); // Compute the size of the packed buffer. @@ -431,7 +403,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( m_p_pad / m_panel ); else is_p = 1; - // Store the strides and panel dimension in p. + // Store the strides and panel dimension in P. bli_obj_set_strides( rs_p, cs_p, *p ); bli_obj_set_imag_stride( is_p, *p ); bli_obj_set_panel_dim( m_panel, *p ); @@ -524,7 +496,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( n_p_pad / n_panel ); else is_p = 1; - // Store the strides and panel dimension in p. + // Store the strides and panel dimension in P. bli_obj_set_strides( rs_p, cs_p, *p ); bli_obj_set_imag_stride( is_p, *p ); bli_obj_set_panel_dim( n_panel, *p ); @@ -547,99 +519,6 @@ void bli_packm_init_pack( invdiag_t invert_diag, size_p = 0; } - - if ( bli_mem_is_unalloc( mem_p ) ) - { - // If the mem_t object of p has not yet been allocated, then acquire - // a memory block of type pack_buf_type. - bli_membrk_acquire_m( membrk, - size_p, - pack_buf_type, - mem_p ); - } - else - { - // If the mem_t object is currently allocated and smaller than is - // needed, then it must have been allocated for a different type - // of object (a different pack_buf_type value), so we must first - // release it and then re-acquire it using the new size and new - // pack_buf_type value. - if ( bli_mem_size( mem_p ) < size_p ) - { - bli_membrk_release( mem_p ); - bli_membrk_acquire_m( membrk, - size_p, - pack_buf_type, - mem_p ); - } - } - - // Grab the buffer address from the mem_t object and copy it to the - // main object buffer field. 
(Sometimes this buffer address will be - // copied when the value is already up-to-date, because it persists - // in the main object buffer field across loop iterations.) - buf = bli_mem_buffer( mem_p ); - bli_obj_set_buffer( buf, *p ); - + return size_p; } -void bli_packm_release( obj_t* p, - packm_t* cntl ) -{ - if ( !bli_cntl_is_noop( cntl ) ) - bli_obj_release_pack( p ); -} - - -/* -void bli_packm_init_cast( obj_t* a, - obj_t* p, - obj_t* c ) -{ - // The idea here is that we want to create an object c that is identical - // to object a, except that: - // (1) the storage datatype of c is equal to the target datatype of a, - // with the element size of c adjusted accordingly, - // (2) the view offset of c is reset to (0,0), - // (3) object c's main buffer is set to a new memory region acquired - // from the memory manager, or extracted from p if a mem entry is - // already available, (After acquring a mem entry from the memory - // manager, it is cached within p for quick access later on.) - // (4) object c is marked as being stored in a standard, contiguous - // format (ie: a column-major order). - // Any transposition encoded within object a will not be handled here, - // but rather will be handled in the packm implementation. That way, - // the only thing castm needs to do is cast. - - num_t dt_targ_a = bli_obj_target_datatype( *a ); - dim_t m_a = bli_obj_length( *a ); - siz_t elem_size_c = bli_datatype_size( dt_targ_a ); - inc_t rs_c, cs_c; - - // We begin by copying the basic fields of a. - bli_obj_alias_to( *a, *c ); - - // Update datatype and element size fields. - bli_obj_set_datatype( dt_targ_a, *c ); - bli_obj_set_elem_size( elem_size_c, *c ); - - // Reset the view offsets to (0,0). - bli_obj_set_offs( 0, 0, *c ); - - // Check the mem_t entry of p associated with the cast buffer. If it is - // NULL, then acquire memory sufficient to hold the object data and cache - // it to p. 
(Otherwise, if it is non-NULL, then memory has already been - // acquired from the memory manager and cached.) We then set the main - // buffer of c to the cached address of the cast memory. - bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); - - // Update the strides. We set the increments to reflect column-major order - // storage. We start the leading dimension out as m(a) and increment it if - // necessary so that the beginning of each column is aligned. - cs_c = bli_align_dim_to_size( m_a, elem_size_c, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - rs_c = 1; - bli_obj_set_strides( rs_c, cs_c, *c ); -} -*/ - diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index a21956ba2..fe0de52fc 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -32,28 +32,24 @@ */ -void bli_packm_init( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl ); +siz_t bli_packm_init + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl + ); -void bli_packm_init_pack( invdiag_t invert_diag, - pack_t pack_schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - packbuf_t pack_buf_type, - bszid_t mr_id, - bszid_t nr_id, - obj_t* c, - obj_t* p, - cntx_t* cntx ); - -/* -void bli_packm_init_cast( obj_t* a, - obj_t* p, - obj_t* c ); -*/ - -void bli_packm_release( obj_t* p, - packm_t* cntl ); +siz_t bli_packm_init_pack + ( + invdiag_t invert_diag, + pack_t schema, + packord_t pack_ord_if_up, + packord_t pack_ord_if_lo, + bszid_t bmult_id_m, + bszid_t bmult_id_n, + obj_t* a, + obj_t* p, + cntx_t* cntx + ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 7d55c2a64..22ce70a44 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -34,33 +34,16 @@ #include "blis.h" -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* p, - cntx_t* cntx, - thrinfo_t* t ); - -static FUNCPTR_T vars[6][3] = +void bli_packm_int + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + 
cntl_t* cntl, + thrinfo_t* thread + ) { - // unblocked optimized unblocked blocked - { bli_packm_unb_var1, NULL, bli_packm_blk_var1 }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, -}; - -void bli_packm_int( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl, - thrinfo_t* thread ) -{ - varnum_t n; - impl_t i; - FUNCPTR_T f; + packm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -70,14 +53,6 @@ void bli_packm_int( obj_t* a, // it, then we should fold it into the next alias-and-early-exit block. //if ( bli_obj_has_zero_dim( *a ) ) bli_abort(); - // First check if we are to skip this operation because the control tree - // is NULL. We return without taking any action because a was already - // aliased to p in packm_init(). - if ( bli_cntl_is_noop( cntl ) ) - { - return; - } - // Let us now check to see if the object has already been packed. First // we check if it has been packed to an unspecified (row or column) // format, in which case we can return, since by now aliasing has already @@ -101,7 +76,7 @@ void bli_packm_int( obj_t* a, // already taken place, or does not need to take place, and so that will // be indicated by the pack status). Also, not all combinations of // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( *a ) == cntl_pack_schema( cntl ) ) + if ( bli_obj_pack_schema( *a ) == bli_cntl_packm_params_pack_schema( cntl ) ) { return; } @@ -113,21 +88,17 @@ void bli_packm_int( obj_t* a, return; } - - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. - f = vars[n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_packm_params_var_func( cntl ); // Invoke the variant with kappa_use. 
- f( a, - p, - cntx, - thread ); - - // Barrier so that packing is done before computation - bli_thread_obarrier( thread ); + f + ( + a, + p, + cntx, + cntl, + thread + ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 89bd4f0d5..14d006d28 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -32,9 +32,11 @@ */ -void bli_packm_int( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl, - thrinfo_t* thread ); - +void bli_packm_int + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c index 47f0dc362..1c1265661 100644 --- a/frame/1m/packm/bli_packm_thrinfo.c +++ b/frame/1m/packm/bli_packm_thrinfo.c @@ -41,7 +41,8 @@ thrinfo_t* bli_packm_thrinfo_create thrcomm_t* icomm, dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ) { thrinfo_t* thread = bli_malloc_intl( sizeof( thrinfo_t ) ); @@ -53,9 +54,8 @@ thrinfo_t* bli_packm_thrinfo_create icomm, icomm_id, n_way, work_id, - NULL, - NULL, - NULL + FALSE, + sub_node ); return thread; @@ -69,7 +69,8 @@ void bli_packm_thrinfo_init thrcomm_t* icomm, dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ) { bli_thrinfo_init @@ -78,9 +79,8 @@ void bli_packm_thrinfo_init ocomm, ocomm_id, icomm, icomm_id, n_way, work_id, - NULL, - NULL, - NULL + FALSE, + sub_node ); } @@ -95,7 +95,8 @@ void bli_packm_thrinfo_init_single &BLIS_SINGLE_COMM, 0, &BLIS_SINGLE_COMM, 0, 1, - 0 + 0, + NULL ); } diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h index 45ab46c3c..7b6d7ae4d 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/1m/packm/bli_packm_thrinfo.h @@ -49,7 +49,8 @@ thrinfo_t* bli_packm_thrinfo_create thrcomm_t* icomm, dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ); void bli_packm_thrinfo_init @@ -60,7 +61,8 @@ 
void bli_packm_thrinfo_init thrcomm_t* icomm, dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single diff --git a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c index 75e999320..49b3a918a 100644 --- a/frame/1m/packm/bli_packm_unb_var1.c +++ b/frame/1m/packm/bli_packm_unb_var1.c @@ -55,10 +55,14 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1); -void bli_packm_unb_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* thread ) +void bli_packm_unb_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_cp = bli_obj_datatype( *c ); diff --git a/frame/1m/packm/bli_packm_unb_var1.h b/frame/1m/packm/bli_packm_unb_var1.h index 3d737d483..cefd4de94 100644 --- a/frame/1m/packm/bli_packm_unb_var1.h +++ b/frame/1m/packm/bli_packm_unb_var1.h @@ -32,10 +32,14 @@ */ -void bli_packm_unb_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* thread ); +void bli_packm_unb_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); #undef GENTPROT diff --git a/frame/3/trsm/old/bli_trsm_cntx.h b/frame/1m/scalm/bli_scalm.h similarity index 96% rename from frame/3/trsm/old/bli_trsm_cntx.h rename to frame/1m/scalm/bli_scalm.h index 0bdc9e7a8..303ec3860 100644 --- a/frame/3/trsm/old/bli_trsm_cntx.h +++ b/frame/1m/scalm/bli_scalm.h @@ -32,6 +32,5 @@ */ -void bli_trsm_cntx_init( void ); -void bli_trsm_cntx_finalize( void ); +#include "bli_scalm_cntl.h" diff --git a/frame/1m/scalm/bli_scalm_cntl.c b/frame/1m/scalm/bli_scalm_cntl.c index 4a965b3fa..f6008a9a3 100644 --- a/frame/1m/scalm/bli_scalm_cntl.c +++ b/frame/1m/scalm/bli_scalm_cntl.c @@ -34,38 +34,25 @@ #include "blis.h" -scalm_t* scalm_cntl = NULL; - -void bli_scalm_cntl_init() +cntl_t* bli_scalm_cntl_obj_create + ( + void* var_func, + cntl_t* sub_node + ) { - scalm_cntl = bli_scalm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1 ); -} 
+ cntl_t* cntl; -void bli_scalm_cntl_finalize() -{ - bli_cntl_obj_free( scalm_cntl ); -} - - -scalm_t* bli_scalm_cntl_obj_create( impl_t impl_type, - varnum_t var_num ) -{ - scalm_t* cntl; - - cntl = ( scalm_t* ) bli_malloc_intl( sizeof(scalm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. + cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + NULL, + sub_node + ); return cntl; } - -void bli_scalm_cntl_obj_init( scalm_t* cntl, - impl_t impl_type, - varnum_t var_num ) -{ - cntl->impl_type = impl_type; - cntl->var_num = var_num; -} - diff --git a/frame/1m/scalm/bli_scalm_cntl.h b/frame/1m/scalm/bli_scalm_cntl.h index ccda9217e..4029a4f10 100644 --- a/frame/1m/scalm/bli_scalm_cntl.h +++ b/frame/1m/scalm/bli_scalm_cntl.h @@ -32,20 +32,9 @@ */ -struct scalm_s -{ - impl_t impl_type; - varnum_t var_num; -}; -typedef struct scalm_s scalm_t; - -#define bli_cntl_sub_scalm( cntl ) cntl->sub_scalm - -void bli_scalm_cntl_init( void ); -void bli_scalm_cntl_finalize( void ); -scalm_t* bli_scalm_cntl_obj_create( impl_t impl_type, - varnum_t var_num ); -void bli_scalm_cntl_obj_init( scalm_t* cntl, - impl_t impl_type, - varnum_t var_num ); +cntl_t* bli_scalm_cntl_obj_create + ( + void* var_func, + cntl_t* sub_node + ); diff --git a/frame/1m/scalm/bli_scalm_int.c b/frame/1m/scalm/other/bli_scalm_int.c similarity index 100% rename from frame/1m/scalm/bli_scalm_int.c rename to frame/1m/scalm/other/bli_scalm_int.c diff --git a/frame/1m/scalm/bli_scalm_int.h b/frame/1m/scalm/other/bli_scalm_int.h similarity index 100% rename from frame/1m/scalm/bli_scalm_int.h rename to frame/1m/scalm/other/bli_scalm_int.h diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index 8254f5043..e300cb66f 
100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -37,8 +37,7 @@ #include "bli_unpackm_int.h" #include "bli_unpackm_unb_var1.h" -//#include "bli_unpackm_blk_var1.h" -#include "bli_unpackm_blk_var2.h" +#include "bli_unpackm_blk_var1.h" #include "bli_unpackm_cxk.h" diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c similarity index 96% rename from frame/1m/unpackm/bli_unpackm_blk_var2.c rename to frame/1m/unpackm/bli_unpackm_blk_var1.c index ab2c2cf1c..bb9f0ee22 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var2.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -52,13 +52,17 @@ typedef void (*FUNCPTR_T)( cntx_t* cntx ); -static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var2); +static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1); -void bli_unpackm_blk_var2( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ) +void bli_unpackm_blk_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -266,5 +270,5 @@ void PASTEMAC(ch,varname) \ \ } -INSERT_GENTFUNC_BASIC0( unpackm_blk_var2 ) +INSERT_GENTFUNC_BASIC0( unpackm_blk_var1 ) diff --git a/frame/3/trmm/old/bli_trmm_thread.h b/frame/1m/unpackm/bli_unpackm_blk_var1.h similarity index 71% rename from frame/3/trmm/old/bli_trmm_thread.h rename to frame/1m/unpackm/bli_unpackm_blk_var1.h index bedc7781f..330e9b089 100644 --- a/frame/3/trmm/old/bli_trmm_thread.h +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.h @@ -32,14 +32,35 @@ */ -#define bli_thrinfo_sub_self( thread ) thread->sub_l3op -#define bli_thrinfo_sub_opackm( thread ) thread->opackm -#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm +void bli_unpackm_blk_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); -#define trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == 
thread->work_id % thread->n_way ) -#define trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -//thrinfo_t** bli_trmm_thrinfo_create_paths( bool_t jc_dependency ); +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + dim_t m, \ + dim_t n, \ + dim_t m_panel, \ + dim_t n_panel, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROT_BASIC( unpackm_blk_var1 ) diff --git a/frame/1m/unpackm/bli_unpackm_check.c b/frame/1m/unpackm/bli_unpackm_check.c index 87af08f43..0ffa984b2 100644 --- a/frame/1m/unpackm/bli_unpackm_check.c +++ b/frame/1m/unpackm/bli_unpackm_check.c @@ -34,10 +34,12 @@ #include "blis.h" -void bli_unpackm_check( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl ) +void bli_unpackm_int_check + ( + obj_t* p, + obj_t* a, + cntx_t* cntx + ) { err_t e_val; diff --git a/frame/1m/unpackm/bli_unpackm_check.h b/frame/1m/unpackm/bli_unpackm_check.h index 217b03c4a..889dd7831 100644 --- a/frame/1m/unpackm/bli_unpackm_check.h +++ b/frame/1m/unpackm/bli_unpackm_check.h @@ -32,7 +32,10 @@ */ -void bli_unpackm_check( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl ); +void bli_unpackm_int_check + ( + obj_t* p, + obj_t* a, + cntx_t* cntx + ); + diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index 0e99bb741..2900cb3b8 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -34,42 +34,35 @@ #include "blis.h" -unpackm_t* unpackm_cntl = NULL; - -void bli_unpackm_cntl_init() +cntl_t* bli_unpackm_cntl_obj_create + ( + void* var_func, + void* unpackm_var_func, + cntl_t* sub_node + ) { - unpackm_cntl = 
bli_unpackm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, - NULL ); // no blocksize needed -} + cntl_t* cntl; + unpackm_params_t* params; -void bli_unpackm_cntl_finalize() -{ - bli_cntl_obj_free( unpackm_cntl ); -} + // Allocate an unpackm_params_t struct. + params = bli_malloc_intl( sizeof( unpackm_params_t ) ); -unpackm_t* bli_unpackm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - blksz_t* b ) -{ - unpackm_t* cntl; + // Initialize the unpackm_params_t struct. + params->size = sizeof( unpackm_params_t ); + params->var_func = unpackm_var_func; - cntl = ( unpackm_t* ) bli_malloc_intl( sizeof(unpackm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->b = b; + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. + cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + params, + sub_node + ); return cntl; } -void bli_unpackm_cntl_obj_init( unpackm_t* cntl, - impl_t impl_type, - varnum_t var_num, - blksz_t* b ) -{ - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->b = b; -} - diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h index 8a3935ba4..82d9727fc 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/1m/unpackm/bli_unpackm_cntl.h @@ -32,28 +32,23 @@ */ -struct unpackm_s +struct unpackm_params_s { - impl_t impl_type; - varnum_t var_num; - blksz_t* b; + uint64_t size; // size field must be present and come first. 
+ unpackm_voft var_func; }; -typedef struct unpackm_s unpackm_t; +typedef struct unpackm_params_s unpackm_params_t; -#define bli_cntl_sub_unpackm( cntl ) cntl->sub_unpackm -#define bli_cntl_sub_unpackm_a( cntl ) cntl->sub_unpackm_a -#define bli_cntl_sub_unpackm_a11( cntl ) cntl->sub_unpackm_a11 -#define bli_cntl_sub_unpackm_b( cntl ) cntl->sub_unpackm_b -#define bli_cntl_sub_unpackm_b11( cntl ) cntl->sub_unpackm_b11 -#define bli_cntl_sub_unpackm_c( cntl ) cntl->sub_unpackm_c -#define bli_cntl_sub_unpackm_c11( cntl ) cntl->sub_unpackm_c11 +#define bli_cntl_unpackm_params_var_func( cntl ) \ +\ + ( ( (unpackm_params_t*)(cntl)->params )->var_func ) + +// ----------------------------------------------------------------------------- + +cntl_t* bli_unpackm_cntl_obj_create + ( + void* var_func, + void* unpackm_var_func, + cntl_t* sub_node + ); -void bli_unpackm_cntl_init( void ); -void bli_unpackm_cntl_finalize( void ); -unpackm_t* bli_unpackm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - blksz_t* b ); -void bli_unpackm_cntl_obj_init( unpackm_t* cntl, - impl_t impl_type, - varnum_t var_num, - blksz_t* b ); diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/unpackm/bli_unpackm_cxk.c index a31a7f9dc..0ffaa78e5 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.c +++ b/frame/1m/unpackm/bli_unpackm_cxk.c @@ -152,15 +152,16 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTEMAC(ch,opname)( \ - conj_t conjp, \ - dim_t m, \ - dim_t n, \ - void* beta, \ - void* p, inc_t ldp, \ - void* a, inc_t inca, inc_t lda, \ - cntx_t* cntx \ - ) \ +void PASTEMAC(ch,opname) \ + ( \ + conj_t conjp, \ + dim_t m, \ + dim_t n, \ + void* beta, \ + void* p, inc_t ldp, \ + void* a, inc_t inca, inc_t lda, \ + cntx_t* cntx \ + ) \ { \ dim_t panel_dim; \ num_t dt; \ diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index 62b2b3530..b76d325b9 100644 --- 
a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -34,188 +34,43 @@ #include "blis.h" -#define FUNCPTR_T unpackm_fp - -typedef void (*FUNCPTR_T)( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl ); - -static FUNCPTR_T vars[2][3] = +void bli_unpackm_int + ( + obj_t* p, + obj_t* a, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { - // unblocked optimized unblocked blocked - { bli_unpackm_unb_var1, NULL, NULL, }, - { NULL, NULL, bli_unpackm_blk_var2, }, -}; + unpackm_voft f; -void bli_unpackm_int( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl, - thrinfo_t* thread ) -{ - // The unpackm operation consists of an optional post-process: castm. - // (This post-process is analogous to the castm pre-process in packm.) - // Here are the following possible ways unpackm can execute: - // 1. unpack and cast: Unpack to a temporary matrix c and then cast - // c to a. - // 2. unpack only: Unpack directly to matrix a since typecasting is - // not needed. - // 3. cast only: Not yet supported / not used. - // 4. no-op: The control tree directs us to skip the unpack operation - // entirely. No action is taken. - - obj_t c; - - varnum_t n; - impl_t i; - FUNCPTR_T f; - - // Sanity check; A should never have a zero dimension. If we must support - // it, then we should fold it into the next alias-and-early-exit block. - //if ( bli_obj_has_zero_dim( *a ) ) bli_abort(); - - // First check if we are to skip this operation because the control tree - // is NULL, and if so, simply return. - if ( bli_cntl_is_noop( cntl ) ) - { - return; - } + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_unpackm_int_check( p, a, cntx ); // If p was aliased to a during the pack stage (because it was already // in an acceptable packed/contiguous format), then no unpack is actually // necessary, so we return. - if ( bli_obj_is_alias_of( *p, *a ) ) - { - return; - } + if ( bli_obj_is_alias_of( *p, *a ) ) return; - // Check parameters. 
- if ( bli_error_checking_is_enabled() ) - bli_unpackm_check( p, a, cntx, cntl ); - - // Now, if we are not skipping the unpack operation, then the only - // question left is whether we are to typecast matrix a after unpacking. - if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) - bli_abort(); -/* - if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) - { - // Initialize an object c for the intermediate typecast matrix. - bli_unpackm_init_cast( p, - a, - &c ); - } - else -*/ - { - // If no cast is needed, then aliasing object c to the original - // matrix serves as a minor optimization. This causes the unpackm - // implementation to unpack directly into matrix a. - bli_obj_alias_to( *a, c ); - } - - // Now we are ready to proceed with the unpacking. - - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. - f = vars[n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_unpackm_params_var_func( cntl ); // Invoke the variant. - if( bli_thread_am_ochief( thread ) ) { - f( p, - &c, - cntx, - cntl ); - } - bli_thread_obarrier( thread ); - - // Now, if necessary, we cast the contents of c to matrix a. If casting - // was not necessary, then we are done because the call to the unpackm - // implementation would have unpacked directly to matrix a. -/* - if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) + if ( bli_thread_am_ochief( thread ) ) { - // Copy/typecast matrix c to matrix a. - // NOTE: Here, we use copynzm instead of copym because, in the cases - // where we are unpacking/typecasting a real matrix c to a complex - // matrix a, we want to touch only the real components of a, rather - // than also set the imaginary components to zero. 
This comes about - // because of the fact that, if we are unpacking real-to-complex, - // then it is because all of the computation occurred in the real - // domain, and so we would want to leave whatever imaginary values - // there are in matrix a untouched. Notice that for unpackings that - // entail complex-to-complex data movements, the copynzm operation - // behaves exactly as copym, so no use cases are lost (at least none - // that I can think of). - bli_copynzm( &c, - a ); + f + ( + p, + a, + cntx, + cntl, + thread + ); + } - // NOTE: The above code/comment is outdated. What should happen is - // as follows: - // - If dt(a) is complex and dt(p) is real, then create an alias of - // a and then tweak it so that it looks like a real domain object. - // This will involve: - // - projecting the datatype to real domain - // - scaling both the row and column strides by 2 - // ALL OF THIS should be done in the front-end, NOT here, as - // unpackm() won't even be needed in that case. - } -*/ + // Barrier so that unpacking is done before computation. + bli_thread_obarrier( thread ); } -/* -void bli_unpackm_init_cast( obj_t* p, - obj_t* a, - obj_t* c ) -{ - // The idea here is that we want to create an object c that is identical - // to object a, except that: - // (1) the storage datatype of c is equal to the target datatype of a, - // with the element size of c adjusted accordingly, - // (2) the view offset of c is reset to (0,0), - // (3) object c's main buffer is set to a new memory region acquired - // from the memory manager, or extracted from p if a mem entry is - // already available, (After acquring a mem entry from the memory - // manager, it is cached within p for quick access later on.) - // (4) object c is marked as being stored in a standard, contiguous - // format (ie: column-major order). - // Any transposition encoded within object a will also be encoded in - // object c. 
That way, unpackm handles any needed transposition during - // the unpacking, and the only thing the cast stage needs to do is cast. - - num_t dt_targ_a = bli_obj_target_datatype( *a ); - dim_t m_a = bli_obj_length( *a ); - siz_t elem_size_c = bli_datatype_size( dt_targ_a ); - - inc_t rs_c, cs_c; - - // We begin by copying the basic fields of a. - bli_obj_alias_to( *a, *c ); - - // Update datatype and element size fields. - bli_obj_set_datatype( dt_targ_a, *c ); - bli_obj_set_elem_size( elem_size_c, *c ); - - // Reset the view offsets to (0,0). - bli_obj_set_offs( 0, 0, *c ); - - // Check the mem_t entry of p associated with the cast buffer. If it is - // NULL, then acquire memory sufficient to hold the object data and cache - // it to p. (Otherwise, if it is non-NULL, then memory has already been - // acquired from the memory manager and cached.) We then set the main - // buffer of c to the cached address of the cast memory. - bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); - - // Update the strides. We set the increments to reflect column-major order - // storage. We start the leading dimension out as m(a) and increment it if - // necessary so that the beginning of each column is aligned. 
- cs_c = bli_align_dim_to_size( m_a, elem_size_c, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - rs_c = 1; - bli_obj_set_strides( rs_c, cs_c, *c ); -} -*/ diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h index 6e7a26a13..26cf7877b 100644 --- a/frame/1m/unpackm/bli_unpackm_int.h +++ b/frame/1m/unpackm/bli_unpackm_int.h @@ -32,14 +32,12 @@ */ -void bli_unpackm_int( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl, - thrinfo_t* thread ); +void bli_unpackm_int + ( + obj_t* p, + obj_t* a, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); -/* -void bli_unpackm_init_cast( obj_t* p, - obj_t* a, - obj_t* c ); -*/ diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.c b/frame/1m/unpackm/bli_unpackm_unb_var1.c index 0794f6c4f..9e86a78de 100644 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.c +++ b/frame/1m/unpackm/bli_unpackm_unb_var1.c @@ -50,10 +50,14 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,unpackm_unb_var1); -void bli_unpackm_unb_var1( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ) +void bli_unpackm_unb_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_pc = bli_obj_datatype( *p ); diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.h b/frame/1m/unpackm/bli_unpackm_unb_var1.h index fcb98bda5..40c921522 100644 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.h +++ b/frame/1m/unpackm/bli_unpackm_unb_var1.h @@ -32,10 +32,14 @@ */ -void bli_unpackm_unb_var1( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ); +void bli_unpackm_unb_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ diff --git a/frame/2/gemv/bli_gemv.h b/frame/2/gemv/bli_gemv.h index b7c39613c..b4c6b4816 100644 --- a/frame/2/gemv/bli_gemv.h +++ b/frame/2/gemv/bli_gemv.h @@ -32,9 +32,10 @@ */ -#include "bli_gemv_cntl.h" -#include "bli_gemv_front.h" -#include "bli_gemv_int.h" +// NOTE: level-2 control 
tree code is temporarily disabled. +//#include "bli_gemv_cntl.h" +//#include "bli_gemv_front.h" +//#include "bli_gemv_int.h" #include "bli_gemv_var.h" diff --git a/frame/2/gemv/bli_gemv_var.h b/frame/2/gemv/bli_gemv_var.h index 9dd3f5d71..4e2a03908 100644 --- a/frame/2/gemv/bli_gemv_var.h +++ b/frame/2/gemv/bli_gemv_var.h @@ -48,7 +48,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - gemv_t* cntl \ + cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) diff --git a/frame/2/gemv/bli_gemv_var_oapi.c b/frame/2/gemv/bli_gemv_var_oapi.c index 6d27452c2..f1662c922 100644 --- a/frame/2/gemv/bli_gemv_var_oapi.c +++ b/frame/2/gemv/bli_gemv_var_oapi.c @@ -45,7 +45,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - gemv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/3/trsm/old/bli_trsm_cntx.c b/frame/2/gemv/old/bli_gemv_var_oapi.c.prev similarity index 52% rename from frame/3/trsm/old/bli_trsm_cntx.c rename to frame/2/gemv/old/bli_gemv_var_oapi.c.prev index 186c146df..771cfbf12 100644 --- a/frame/3/trsm/old/bli_trsm_cntx.c +++ b/frame/2/gemv/old/bli_gemv_var_oapi.c.prev @@ -34,43 +34,64 @@ #include "blis.h" -void bli_trsm_cntx_init( cntx_t* cntx ) -{ - // Perform basic setup on the context. 
- bli_cntx_obj_create( cntx ); +#undef GENFRONT +#define GENFRONT( ftname, opname ) \ +\ +/*static gemv_vft GENARRAY(ftypes,gemv_unb_var1);*/ \ +static GENARRAY_VFP(ftname,opname); \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y, \ + cntx_t* cntx, \ + gemv_t* cntl \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *a ); \ +\ + trans_t transa = bli_obj_conjtrans_status( *a ); \ + conj_t conjx = bli_obj_conj_status( *x ); \ +\ + dim_t m = bli_obj_length( *a ); \ + dim_t n = bli_obj_width( *a ); \ +\ + void* buf_a = bli_obj_buffer_at_off( *a ); \ + inc_t rs_a = bli_obj_row_stride( *a ); \ + inc_t cs_a = bli_obj_col_stride( *a ); \ +\ + void* buf_x = bli_obj_buffer_at_off( *x ); \ + inc_t incx = bli_obj_vector_inc( *x ); \ +\ + void* buf_y = bli_obj_buffer_at_off( *y ); \ + inc_t incy = bli_obj_vector_inc( *y ); \ +\ + void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); \ + void* buf_beta = bli_obj_buffer_for_1x1( dt, *beta ); \ +\ + PASTECH(ftname,_vft) f = PASTECH(opname,_vfp)[dt]; \ +\ + /* Invoke the void pointer-based function for the given datatype. */ \ + f( \ + transa, \ + conjx, \ + m, \ + n, \ + buf_alpha, \ + buf_a, rs_a, cs_a, \ + buf_x, incx, \ + buf_beta, \ + buf_y, incy, \ + cntx \ + ); \ +} \ - // Initialize the context with the current architecture's native - // level-3 gemm micro-kernel, and its output preferences. - bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMM_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); +GENFRONT( gemv, gemv_unb_var1 ) +GENFRONT( gemv, gemv_unb_var2 ) - // Initialize the context with the current architecture's native - // level-3 trsm micro-kernels. 
- bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMMTRSM_L_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMMTRSM_U_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_L_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_U_UKR, cntx ); - - // Initialize the context with the current architecture's register - // and cache blocksizes (and multiples), given the execution method. - bli_gks_cntx_set_blkszs( BLIS_NAT, 6, - BLIS_NC, BLIS_NR, - BLIS_KC, BLIS_KR, - BLIS_MC, BLIS_MR, - BLIS_NR, BLIS_NR, - BLIS_MR, BLIS_MR, - BLIS_KR, BLIS_KR, - cntx ); - - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); -} - -void bli_trsm_cntx_finalize( cntx_t* cntx ) -{ - // Free the context and all memory allocated to it. - bli_cntx_obj_free( cntx ); -} +GENFRONT( gemv, gemv_unf_var1 ) +GENFRONT( gemv, gemv_unf_var2 ) diff --git a/frame/2/gemv/bli_gemv_blk_var1.c b/frame/2/gemv/other/bli_gemv_blk_var1.c similarity index 100% rename from frame/2/gemv/bli_gemv_blk_var1.c rename to frame/2/gemv/other/bli_gemv_blk_var1.c diff --git a/frame/2/gemv/bli_gemv_blk_var2.c b/frame/2/gemv/other/bli_gemv_blk_var2.c similarity index 100% rename from frame/2/gemv/bli_gemv_blk_var2.c rename to frame/2/gemv/other/bli_gemv_blk_var2.c diff --git a/frame/2/gemv/bli_gemv_cntl.c b/frame/2/gemv/other/bli_gemv_cntl.c similarity index 100% rename from frame/2/gemv/bli_gemv_cntl.c rename to frame/2/gemv/other/bli_gemv_cntl.c diff --git a/frame/2/gemv/bli_gemv_cntl.h b/frame/2/gemv/other/bli_gemv_cntl.h similarity index 100% rename from frame/2/gemv/bli_gemv_cntl.h rename to frame/2/gemv/other/bli_gemv_cntl.h diff --git a/frame/2/gemv/bli_gemv_front.c b/frame/2/gemv/other/bli_gemv_front.c similarity index 100% rename from frame/2/gemv/bli_gemv_front.c rename to frame/2/gemv/other/bli_gemv_front.c diff --git a/frame/2/gemv/bli_gemv_front.h b/frame/2/gemv/other/bli_gemv_front.h similarity index 100% rename from 
frame/2/gemv/bli_gemv_front.h rename to frame/2/gemv/other/bli_gemv_front.h diff --git a/frame/2/gemv/bli_gemv_int.c b/frame/2/gemv/other/bli_gemv_int.c similarity index 100% rename from frame/2/gemv/bli_gemv_int.c rename to frame/2/gemv/other/bli_gemv_int.c diff --git a/frame/2/gemv/bli_gemv_int.h b/frame/2/gemv/other/bli_gemv_int.h similarity index 100% rename from frame/2/gemv/bli_gemv_int.h rename to frame/2/gemv/other/bli_gemv_int.h diff --git a/frame/2/ger/bli_ger.h b/frame/2/ger/bli_ger.h index dc6f9e3f9..1d92502a3 100644 --- a/frame/2/ger/bli_ger.h +++ b/frame/2/ger/bli_ger.h @@ -32,8 +32,9 @@ */ -#include "bli_ger_cntl.h" -#include "bli_ger_front.h" -#include "bli_ger_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_ger_cntl.h" +//#include "bli_ger_front.h" +//#include "bli_ger_int.h" #include "bli_ger_var.h" diff --git a/frame/2/ger/bli_ger_var.h b/frame/2/ger/bli_ger_var.h index 5833ec3f4..98451dcae 100644 --- a/frame/2/ger/bli_ger_var.h +++ b/frame/2/ger/bli_ger_var.h @@ -47,7 +47,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ - ger_t* cntl \ + cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) diff --git a/frame/2/ger/bli_ger_var_oapi.c b/frame/2/ger/bli_ger_var_oapi.c index f03452dce..5c4aa113f 100644 --- a/frame/2/ger/bli_ger_var_oapi.c +++ b/frame/2/ger/bli_ger_var_oapi.c @@ -44,7 +44,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ - ger_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/ger/bli_ger_blk_var1.c b/frame/2/ger/other/bli_ger_blk_var1.c similarity index 100% rename from frame/2/ger/bli_ger_blk_var1.c rename to frame/2/ger/other/bli_ger_blk_var1.c diff --git a/frame/2/ger/bli_ger_blk_var2.c b/frame/2/ger/other/bli_ger_blk_var2.c similarity index 100% rename from frame/2/ger/bli_ger_blk_var2.c rename to frame/2/ger/other/bli_ger_blk_var2.c diff --git a/frame/2/ger/bli_ger_cntl.c b/frame/2/ger/other/bli_ger_cntl.c similarity 
index 100% rename from frame/2/ger/bli_ger_cntl.c rename to frame/2/ger/other/bli_ger_cntl.c diff --git a/frame/2/ger/bli_ger_cntl.h b/frame/2/ger/other/bli_ger_cntl.h similarity index 100% rename from frame/2/ger/bli_ger_cntl.h rename to frame/2/ger/other/bli_ger_cntl.h diff --git a/frame/2/ger/bli_ger_front.c b/frame/2/ger/other/bli_ger_front.c similarity index 100% rename from frame/2/ger/bli_ger_front.c rename to frame/2/ger/other/bli_ger_front.c diff --git a/frame/2/ger/bli_ger_front.h b/frame/2/ger/other/bli_ger_front.h similarity index 100% rename from frame/2/ger/bli_ger_front.h rename to frame/2/ger/other/bli_ger_front.h diff --git a/frame/2/ger/bli_ger_int.c b/frame/2/ger/other/bli_ger_int.c similarity index 100% rename from frame/2/ger/bli_ger_int.c rename to frame/2/ger/other/bli_ger_int.c diff --git a/frame/2/ger/bli_ger_int.h b/frame/2/ger/other/bli_ger_int.h similarity index 100% rename from frame/2/ger/bli_ger_int.h rename to frame/2/ger/other/bli_ger_int.h diff --git a/frame/2/hemv/bli_hemv.h b/frame/2/hemv/bli_hemv.h index 07b5ff0c0..7ac4b0b13 100644 --- a/frame/2/hemv/bli_hemv.h +++ b/frame/2/hemv/bli_hemv.h @@ -32,9 +32,10 @@ */ -#include "bli_hemv_cntl.h" -#include "bli_hemv_front.h" -#include "bli_hemv_int.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_hemv_cntl.h" +//#include "bli_hemv_front.h" +//#include "bli_hemv_int.h" #include "bli_hemv_var.h" diff --git a/frame/2/hemv/bli_hemv_var.h b/frame/2/hemv/bli_hemv_var.h index cf0e25bd4..db00df441 100644 --- a/frame/2/hemv/bli_hemv_var.h +++ b/frame/2/hemv/bli_hemv_var.h @@ -49,7 +49,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - hemv_t* cntl \ + cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) diff --git a/frame/2/hemv/bli_hemv_var_oapi.c b/frame/2/hemv/bli_hemv_var_oapi.c index c0fc00ad4..a73dbe9b3 100644 --- a/frame/2/hemv/bli_hemv_var_oapi.c +++ b/frame/2/hemv/bli_hemv_var_oapi.c @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - hemv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/hemv/bli_hemv_blk_var1.c b/frame/2/hemv/other/bli_hemv_blk_var1.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var1.c rename to frame/2/hemv/other/bli_hemv_blk_var1.c diff --git a/frame/2/hemv/bli_hemv_blk_var2.c b/frame/2/hemv/other/bli_hemv_blk_var2.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var2.c rename to frame/2/hemv/other/bli_hemv_blk_var2.c diff --git a/frame/2/hemv/bli_hemv_blk_var3.c b/frame/2/hemv/other/bli_hemv_blk_var3.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var3.c rename to frame/2/hemv/other/bli_hemv_blk_var3.c diff --git a/frame/2/hemv/bli_hemv_blk_var4.c b/frame/2/hemv/other/bli_hemv_blk_var4.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var4.c rename to frame/2/hemv/other/bli_hemv_blk_var4.c diff --git a/frame/2/hemv/bli_hemv_cntl.c b/frame/2/hemv/other/bli_hemv_cntl.c similarity index 100% rename from frame/2/hemv/bli_hemv_cntl.c rename to frame/2/hemv/other/bli_hemv_cntl.c diff --git a/frame/2/hemv/bli_hemv_cntl.h b/frame/2/hemv/other/bli_hemv_cntl.h similarity index 100% rename from frame/2/hemv/bli_hemv_cntl.h rename to frame/2/hemv/other/bli_hemv_cntl.h diff --git 
a/frame/2/hemv/bli_hemv_front.c b/frame/2/hemv/other/bli_hemv_front.c similarity index 100% rename from frame/2/hemv/bli_hemv_front.c rename to frame/2/hemv/other/bli_hemv_front.c diff --git a/frame/2/hemv/bli_hemv_front.h b/frame/2/hemv/other/bli_hemv_front.h similarity index 100% rename from frame/2/hemv/bli_hemv_front.h rename to frame/2/hemv/other/bli_hemv_front.h diff --git a/frame/2/hemv/bli_hemv_int.c b/frame/2/hemv/other/bli_hemv_int.c similarity index 100% rename from frame/2/hemv/bli_hemv_int.c rename to frame/2/hemv/other/bli_hemv_int.c diff --git a/frame/2/hemv/bli_hemv_int.h b/frame/2/hemv/other/bli_hemv_int.h similarity index 100% rename from frame/2/hemv/bli_hemv_int.h rename to frame/2/hemv/other/bli_hemv_int.h diff --git a/frame/2/her/bli_her.h b/frame/2/her/bli_her.h index fe9d2d84e..a9a53d569 100644 --- a/frame/2/her/bli_her.h +++ b/frame/2/her/bli_her.h @@ -32,8 +32,9 @@ */ -#include "bli_her_cntl.h" -#include "bli_her_front.h" -#include "bli_her_int.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_her_cntl.h" +//#include "bli_her_front.h" +//#include "bli_her_int.h" #include "bli_her_var.h" diff --git a/frame/2/her/bli_her_var.h b/frame/2/her/bli_her_var.h index 3e65e2bc4..d4c11a0b5 100644 --- a/frame/2/her/bli_her_var.h +++ b/frame/2/her/bli_her_var.h @@ -47,7 +47,7 @@ void PASTEMAC0(opname) \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ - her_t* cntl \ + cntl_t* cntl \ ); GENPROT( her_blk_var1 ) diff --git a/frame/2/her/bli_her_var_oapi.c b/frame/2/her/bli_her_var_oapi.c index a49cf62e0..3567de196 100644 --- a/frame/2/her/bli_her_var_oapi.c +++ b/frame/2/her/bli_her_var_oapi.c @@ -44,7 +44,7 @@ void PASTEMAC0(opname) \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ - her_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *c ); \ diff --git a/frame/2/her/bli_her_blk_var1.c b/frame/2/her/other/bli_her_blk_var1.c similarity index 100% rename from frame/2/her/bli_her_blk_var1.c rename to frame/2/her/other/bli_her_blk_var1.c diff --git a/frame/2/her/bli_her_blk_var2.c b/frame/2/her/other/bli_her_blk_var2.c similarity index 100% rename from frame/2/her/bli_her_blk_var2.c rename to frame/2/her/other/bli_her_blk_var2.c diff --git a/frame/2/her/bli_her_cntl.c b/frame/2/her/other/bli_her_cntl.c similarity index 100% rename from frame/2/her/bli_her_cntl.c rename to frame/2/her/other/bli_her_cntl.c diff --git a/frame/2/her/bli_her_cntl.h b/frame/2/her/other/bli_her_cntl.h similarity index 100% rename from frame/2/her/bli_her_cntl.h rename to frame/2/her/other/bli_her_cntl.h diff --git a/frame/2/her/bli_her_front.c b/frame/2/her/other/bli_her_front.c similarity index 100% rename from frame/2/her/bli_her_front.c rename to frame/2/her/other/bli_her_front.c diff --git a/frame/2/her/bli_her_front.h b/frame/2/her/other/bli_her_front.h similarity index 100% rename from frame/2/her/bli_her_front.h rename to frame/2/her/other/bli_her_front.h diff --git a/frame/2/her/bli_her_int.c b/frame/2/her/other/bli_her_int.c similarity index 100% rename from 
frame/2/her/bli_her_int.c rename to frame/2/her/other/bli_her_int.c diff --git a/frame/2/her/bli_her_int.h b/frame/2/her/other/bli_her_int.h similarity index 100% rename from frame/2/her/bli_her_int.h rename to frame/2/her/other/bli_her_int.h diff --git a/frame/2/her2/bli_her2.h b/frame/2/her2/bli_her2.h index 273b6841e..acf55b7e2 100644 --- a/frame/2/her2/bli_her2.h +++ b/frame/2/her2/bli_her2.h @@ -32,8 +32,9 @@ */ -#include "bli_her2_cntl.h" -#include "bli_her2_front.h" -#include "bli_her2_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_her2_cntl.h" +//#include "bli_her2_front.h" +//#include "bli_her2_int.h" #include "bli_her2_var.h" diff --git a/frame/2/her2/bli_her2_var.h b/frame/2/her2/bli_her2_var.h index 301b6931e..5df14c9d1 100644 --- a/frame/2/her2/bli_her2_var.h +++ b/frame/2/her2/bli_her2_var.h @@ -49,7 +49,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ - her2_t* cntl \ + cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) diff --git a/frame/2/her2/bli_her2_var_oapi.c b/frame/2/her2/bli_her2_var_oapi.c index 6c87496d6..ff345555e 100644 --- a/frame/2/her2/bli_her2_var_oapi.c +++ b/frame/2/her2/bli_her2_var_oapi.c @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ - her2_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *c ); \ diff --git a/frame/2/her2/bli_her2_blk_var1.c b/frame/2/her2/other/bli_her2_blk_var1.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var1.c rename to frame/2/her2/other/bli_her2_blk_var1.c diff --git a/frame/2/her2/bli_her2_blk_var2.c b/frame/2/her2/other/bli_her2_blk_var2.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var2.c rename to frame/2/her2/other/bli_her2_blk_var2.c diff --git a/frame/2/her2/bli_her2_blk_var3.c b/frame/2/her2/other/bli_her2_blk_var3.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var3.c rename to frame/2/her2/other/bli_her2_blk_var3.c diff --git 
a/frame/2/her2/bli_her2_blk_var4.c b/frame/2/her2/other/bli_her2_blk_var4.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var4.c rename to frame/2/her2/other/bli_her2_blk_var4.c diff --git a/frame/2/her2/bli_her2_cntl.c b/frame/2/her2/other/bli_her2_cntl.c similarity index 100% rename from frame/2/her2/bli_her2_cntl.c rename to frame/2/her2/other/bli_her2_cntl.c diff --git a/frame/2/her2/bli_her2_cntl.h b/frame/2/her2/other/bli_her2_cntl.h similarity index 100% rename from frame/2/her2/bli_her2_cntl.h rename to frame/2/her2/other/bli_her2_cntl.h diff --git a/frame/2/her2/bli_her2_front.c b/frame/2/her2/other/bli_her2_front.c similarity index 100% rename from frame/2/her2/bli_her2_front.c rename to frame/2/her2/other/bli_her2_front.c diff --git a/frame/2/her2/bli_her2_front.h b/frame/2/her2/other/bli_her2_front.h similarity index 100% rename from frame/2/her2/bli_her2_front.h rename to frame/2/her2/other/bli_her2_front.h diff --git a/frame/2/her2/bli_her2_int.c b/frame/2/her2/other/bli_her2_int.c similarity index 100% rename from frame/2/her2/bli_her2_int.c rename to frame/2/her2/other/bli_her2_int.c diff --git a/frame/2/her2/bli_her2_int.h b/frame/2/her2/other/bli_her2_int.h similarity index 100% rename from frame/2/her2/bli_her2_int.h rename to frame/2/her2/other/bli_her2_int.h diff --git a/frame/2/symv/bli_symv.h b/frame/2/symv/bli_symv.h index 5195a4c50..8bb1675dc 100644 --- a/frame/2/symv/bli_symv.h +++ b/frame/2/symv/bli_symv.h @@ -32,5 +32,6 @@ */ -#include "bli_symv_front.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_symv_front.h" diff --git a/frame/2/symv/bli_symv_front.c b/frame/2/symv/other/bli_symv_front.c similarity index 100% rename from frame/2/symv/bli_symv_front.c rename to frame/2/symv/other/bli_symv_front.c diff --git a/frame/2/symv/bli_symv_front.h b/frame/2/symv/other/bli_symv_front.h similarity index 100% rename from frame/2/symv/bli_symv_front.h rename to frame/2/symv/other/bli_symv_front.h diff --git a/frame/2/syr/bli_syr.h b/frame/2/syr/bli_syr.h index 25a5e0a63..897ebe2c5 100644 --- a/frame/2/syr/bli_syr.h +++ b/frame/2/syr/bli_syr.h @@ -32,5 +32,6 @@ */ -#include "bli_syr_front.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_syr_front.h" diff --git a/frame/2/syr/bli_syr_front.c b/frame/2/syr/other/bli_syr_front.c similarity index 100% rename from frame/2/syr/bli_syr_front.c rename to frame/2/syr/other/bli_syr_front.c diff --git a/frame/2/syr/bli_syr_front.h b/frame/2/syr/other/bli_syr_front.h similarity index 100% rename from frame/2/syr/bli_syr_front.h rename to frame/2/syr/other/bli_syr_front.h diff --git a/frame/2/syr2/bli_syr2.h b/frame/2/syr2/bli_syr2.h index 39d45c6c5..22a9813ea 100644 --- a/frame/2/syr2/bli_syr2.h +++ b/frame/2/syr2/bli_syr2.h @@ -32,5 +32,6 @@ */ -#include "bli_syr2_front.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_syr2_front.h" diff --git a/frame/2/syr2/bli_syr2_front.c b/frame/2/syr2/other/bli_syr2_front.c similarity index 100% rename from frame/2/syr2/bli_syr2_front.c rename to frame/2/syr2/other/bli_syr2_front.c diff --git a/frame/2/syr2/bli_syr2_front.h b/frame/2/syr2/other/bli_syr2_front.h similarity index 100% rename from frame/2/syr2/bli_syr2_front.h rename to frame/2/syr2/other/bli_syr2_front.h diff --git a/frame/2/trmv/bli_trmv.h b/frame/2/trmv/bli_trmv.h index 242642a91..8410af719 100644 --- a/frame/2/trmv/bli_trmv.h +++ b/frame/2/trmv/bli_trmv.h @@ -32,9 +32,10 @@ */ -#include "bli_trmv_cntl.h" -#include "bli_trmv_front.h" -#include "bli_trmv_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_trmv_cntl.h" +//#include "bli_trmv_front.h" +//#include "bli_trmv_int.h" #include "bli_trmv_var.h" diff --git a/frame/2/trmv/bli_trmv_var.h b/frame/2/trmv/bli_trmv_var.h index cca3be140..23680469e 100644 --- a/frame/2/trmv/bli_trmv_var.h +++ b/frame/2/trmv/bli_trmv_var.h @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trmv_t* cntl \ + cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) diff --git a/frame/2/trmv/bli_trmv_var_oapi.c b/frame/2/trmv/bli_trmv_var_oapi.c index 75926054b..b3c0bc147 100644 --- a/frame/2/trmv/bli_trmv_var_oapi.c +++ b/frame/2/trmv/bli_trmv_var_oapi.c @@ -43,7 +43,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trmv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/trmv/bli_trmv_cntl.c b/frame/2/trmv/other/bli_trmv_cntl.c similarity index 100% rename from frame/2/trmv/bli_trmv_cntl.c rename to frame/2/trmv/other/bli_trmv_cntl.c diff --git a/frame/2/trmv/bli_trmv_cntl.h b/frame/2/trmv/other/bli_trmv_cntl.h similarity index 100% rename from frame/2/trmv/bli_trmv_cntl.h rename to frame/2/trmv/other/bli_trmv_cntl.h diff --git a/frame/2/trmv/bli_trmv_front.c b/frame/2/trmv/other/bli_trmv_front.c similarity index 
100% rename from frame/2/trmv/bli_trmv_front.c rename to frame/2/trmv/other/bli_trmv_front.c diff --git a/frame/2/trmv/bli_trmv_front.h b/frame/2/trmv/other/bli_trmv_front.h similarity index 100% rename from frame/2/trmv/bli_trmv_front.h rename to frame/2/trmv/other/bli_trmv_front.h diff --git a/frame/2/trmv/bli_trmv_int.c b/frame/2/trmv/other/bli_trmv_int.c similarity index 100% rename from frame/2/trmv/bli_trmv_int.c rename to frame/2/trmv/other/bli_trmv_int.c diff --git a/frame/2/trmv/bli_trmv_int.h b/frame/2/trmv/other/bli_trmv_int.h similarity index 100% rename from frame/2/trmv/bli_trmv_int.h rename to frame/2/trmv/other/bli_trmv_int.h diff --git a/frame/2/trmv/bli_trmv_l_blk_var1.c b/frame/2/trmv/other/bli_trmv_l_blk_var1.c similarity index 100% rename from frame/2/trmv/bli_trmv_l_blk_var1.c rename to frame/2/trmv/other/bli_trmv_l_blk_var1.c diff --git a/frame/2/trmv/bli_trmv_l_blk_var2.c b/frame/2/trmv/other/bli_trmv_l_blk_var2.c similarity index 100% rename from frame/2/trmv/bli_trmv_l_blk_var2.c rename to frame/2/trmv/other/bli_trmv_l_blk_var2.c diff --git a/frame/2/trmv/bli_trmv_u_blk_var1.c b/frame/2/trmv/other/bli_trmv_u_blk_var1.c similarity index 100% rename from frame/2/trmv/bli_trmv_u_blk_var1.c rename to frame/2/trmv/other/bli_trmv_u_blk_var1.c diff --git a/frame/2/trmv/bli_trmv_u_blk_var2.c b/frame/2/trmv/other/bli_trmv_u_blk_var2.c similarity index 100% rename from frame/2/trmv/bli_trmv_u_blk_var2.c rename to frame/2/trmv/other/bli_trmv_u_blk_var2.c diff --git a/frame/2/trsv/bli_trsv.h b/frame/2/trsv/bli_trsv.h index 7b51ed69a..9d9384422 100644 --- a/frame/2/trsv/bli_trsv.h +++ b/frame/2/trsv/bli_trsv.h @@ -32,9 +32,10 @@ */ -#include "bli_trsv_cntl.h" -#include "bli_trsv_front.h" -#include "bli_trsv_int.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_trsv_cntl.h" +//#include "bli_trsv_front.h" +//#include "bli_trsv_int.h" #include "bli_trsv_var.h" diff --git a/frame/2/trsv/bli_trsv_var.h b/frame/2/trsv/bli_trsv_var.h index bc66f49ff..395d89d5d 100644 --- a/frame/2/trsv/bli_trsv_var.h +++ b/frame/2/trsv/bli_trsv_var.h @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trsv_t* cntl \ + cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) diff --git a/frame/2/trsv/bli_trsv_var_oapi.c b/frame/2/trsv/bli_trsv_var_oapi.c index f38a5123f..e26bb3abd 100644 --- a/frame/2/trsv/bli_trsv_var_oapi.c +++ b/frame/2/trsv/bli_trsv_var_oapi.c @@ -43,7 +43,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trsv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/trsv/bli_trsv_cntl.c b/frame/2/trsv/other/bli_trsv_cntl.c similarity index 100% rename from frame/2/trsv/bli_trsv_cntl.c rename to frame/2/trsv/other/bli_trsv_cntl.c diff --git a/frame/2/trsv/bli_trsv_cntl.h b/frame/2/trsv/other/bli_trsv_cntl.h similarity index 100% rename from frame/2/trsv/bli_trsv_cntl.h rename to frame/2/trsv/other/bli_trsv_cntl.h diff --git a/frame/2/trsv/bli_trsv_front.c b/frame/2/trsv/other/bli_trsv_front.c similarity index 100% rename from frame/2/trsv/bli_trsv_front.c rename to frame/2/trsv/other/bli_trsv_front.c diff --git a/frame/2/trsv/bli_trsv_front.h b/frame/2/trsv/other/bli_trsv_front.h similarity index 100% rename from frame/2/trsv/bli_trsv_front.h rename to frame/2/trsv/other/bli_trsv_front.h diff --git a/frame/2/trsv/bli_trsv_int.c b/frame/2/trsv/other/bli_trsv_int.c similarity index 100% rename from frame/2/trsv/bli_trsv_int.c rename to frame/2/trsv/other/bli_trsv_int.c diff --git a/frame/2/trsv/bli_trsv_int.h b/frame/2/trsv/other/bli_trsv_int.h similarity index 100% rename from frame/2/trsv/bli_trsv_int.h rename to frame/2/trsv/other/bli_trsv_int.h diff --git a/frame/2/trsv/bli_trsv_l_blk_var1.c 
b/frame/2/trsv/other/bli_trsv_l_blk_var1.c similarity index 100% rename from frame/2/trsv/bli_trsv_l_blk_var1.c rename to frame/2/trsv/other/bli_trsv_l_blk_var1.c diff --git a/frame/2/trsv/bli_trsv_l_blk_var2.c b/frame/2/trsv/other/bli_trsv_l_blk_var2.c similarity index 100% rename from frame/2/trsv/bli_trsv_l_blk_var2.c rename to frame/2/trsv/other/bli_trsv_l_blk_var2.c diff --git a/frame/2/trsv/bli_trsv_u_blk_var1.c b/frame/2/trsv/other/bli_trsv_u_blk_var1.c similarity index 100% rename from frame/2/trsv/bli_trsv_u_blk_var1.c rename to frame/2/trsv/other/bli_trsv_u_blk_var1.c diff --git a/frame/2/trsv/bli_trsv_u_blk_var2.c b/frame/2/trsv/other/bli_trsv_u_blk_var2.c similarity index 100% rename from frame/2/trsv/bli_trsv_u_blk_var2.c rename to frame/2/trsv/other/bli_trsv_u_blk_var2.c diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 9f17349af..ea7926d32 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -33,13 +33,17 @@ */ #include "bli_l3_cntx.h" +#include "bli_l3_cntl.h" #include "bli_l3_check.h" #include "bli_l3_ft.h" #include "bli_l3_oft.h" +#include "bli_l3_voft.h" #include "bli_l3_blocksize.h" +#include "bli_l3_direct.h" #include "bli_l3_prune.h" +#include "bli_l3_packm.h" // Prototype object APIs with and without contexts. 
#include "bli_oapi_w_cntx.h" diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 97556dedd..630cf03a5 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -35,17 +35,78 @@ #include "blis.h" +dim_t bli_l3_determine_kc + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* a, + obj_t* b, + bszid_t bszid, + cntx_t* cntx + ) +{ + opid_t family = bli_cntx_family( cntx ); + + if ( family == BLIS_GEMM ) + return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_HERK ) + return bli_herk_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_TRMM ) + return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_TRSM ) + return bli_trsm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + + // This should never execute. + return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); +} + +// ----------------------------------------------------------------------------- + +// +// NOTE: We call a gemm/hemm/symm, trmm, or trsm-specific blocksize +// function to determine the kc blocksize so that we can implement the +// "nudging" of kc to be a multiple of mr or nr, as needed. 
+// + +#undef GENFRONT +#define GENFRONT( opname, l3op ) \ +\ +dim_t PASTEMAC0(opname) \ + ( \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ + ) \ +{ \ + if ( direct == BLIS_FWD ) \ + return PASTEMAC(l3op,_determine_kc_f)( i, dim, a, b, bszid, cntx ); \ + else \ + return PASTEMAC(l3op,_determine_kc_b)( i, dim, a, b, bszid, cntx ); \ +} + +GENFRONT( gemm_determine_kc, gemm ) +GENFRONT( herk_determine_kc, herk ) +GENFRONT( trmm_determine_kc, trmm ) +GENFRONT( trsm_determine_kc, trsm ) + +// ----------------------------------------------------------------------------- + #undef GENFRONT #define GENFRONT( opname, chdir ) \ \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ) \ { \ num_t dt; \ @@ -90,6 +151,8 @@ dim_t PASTEMAC0(opname) \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ } \ \ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ \ return b_use; \ @@ -105,12 +168,64 @@ GENFRONT( gemm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ + ) \ +{ \ + num_t dt; \ + blksz_t* bsize; \ + dim_t b_alg, b_max; \ + dim_t b_use; \ + \ + /* bli_*_determine_kc_f(): + + We assume that this function is being called from an algorithm that + is moving "forward" (ie: top to bottom, left to right, top-left + to bottom-right). */ \ +\ + /* bli_*_determine_kc_b(): + + We assume that this function is being called from an algorithm that + is moving "backward" (ie: bottom to top, right to left, bottom-right + to top-left).
*/ \ +\ + /* Extract the execution datatype and use it to query the corresponding + blocksize and blocksize maximum values from the blksz_t object. */ \ + dt = bli_obj_execution_datatype( *a ); \ + bsize = bli_cntx_get_blksz( bszid, cntx ); \ + b_alg = bli_blksz_get_def( dt, bsize ); \ + b_max = bli_blksz_get_max( dt, bsize ); \ +\ + /* Notice that for herk, we do not need to perform any special handling + for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \ +\ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ + b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ +\ + return b_use; \ +} + +GENFRONT( herk_determine_kc_f, f ) +GENFRONT( herk_determine_kc_b, b ) + +// ----------------------------------------------------------------------------- + +#undef GENFRONT +#define GENFRONT( opname, chdir ) \ +\ +dim_t PASTEMAC0(opname) \ + ( \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ) \ { \ num_t dt; \ @@ -149,6 +264,8 @@ dim_t PASTEMAC0(opname) \ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ \ return b_use; \ @@ -164,12 +281,12 @@ GENFRONT( trmm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ) \ { \ num_t dt; \ @@ -206,6 +323,8 @@ dim_t PASTEMAC0(opname) \ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ \ return b_use; \ @@ 
-214,282 +333,3 @@ dim_t PASTEMAC0(opname) \ GENFRONT( trsm_determine_kc_f, f ) GENFRONT( trsm_determine_kc_b, b ) - - - - - - - - - -#if 0 -dim_t bli_gemm_determine_kc_f - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "forward" (ie: top to bottom, left to right, top-left - // to bottom-right). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if A is Hermitian or symmetric, or NR if B is - // Hermitian or symmetric. If neither case applies, then we leave - // the blocksizes unchanged. - if ( bli_obj_root_is_herm_or_symm( *a ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - else if ( bli_obj_root_is_herm_or_symm( *b ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - - b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_gemm_determine_kc_b - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "backward" (ie: bottom to top, right to left, bottom-right - // to top-left). 
- - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if A is Hermitian or symmetric, or NR if B is - // Hermitian or symmetric. If neither case applies, then we leave - // the blocksizes unchanged. - if ( bli_obj_root_is_herm_or_symm( *a ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - else if ( bli_obj_root_is_herm_or_symm( *b ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -// ----------------------------------------------------------------------------- - -dim_t bli_trmm_determine_kc_f - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "forward" (ie: top to bottom, left to right, top-left - // to bottom-right). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if the triangular matrix is on the left, or NR - // if the triangular matrix is one the right. 
- if ( bli_obj_root_is_triangular( *a ) ) - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - else - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - - b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_trmm_determine_kc_b - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "backward" (ie: bottom to top, right to left, bottom-right - // to top-left). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if the triangular matrix is on the left, or NR - // if the triangular matrix is one the right. 
- if ( bli_obj_root_is_triangular( *a ) ) - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - else - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -// ----------------------------------------------------------------------------- - -dim_t bli_trsm_determine_kc_f - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "forward" (ie: top to bottom, left to right, top-left - // to bottom-right). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR. We always use MR (rather than sometimes using NR) - // because even when the triangle is on the right, packing of that - // matrix uses MR, since only left-side trsm micro-kernels are - // supported. - mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mr ); - b_max = bli_align_dim_to_mult( b_max, mr ); - - b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_trsm_determine_kc_b - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "backward" (ie: bottom to top, right to left, bottom-right - // to top-left). 
- - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR. We always use MR (rather than sometimes using NR) - // because even when the triangle is on the right, packing of that - // matrix uses MR, since only left-side trsm micro-kernels are - // supported. - mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mr ); - b_max = bli_align_dim_to_mult( b_max, mr ); - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -#endif diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 01e10c3fe..8f9f7ad80 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -38,17 +38,42 @@ \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ + ); + +GENPROT( l3_determine_kc ) + +GENPROT( gemm_determine_kc ) +GENPROT( herk_determine_kc ) +GENPROT( trmm_determine_kc ) +GENPROT( trsm_determine_kc ) + + +#undef GENPROT +#define GENPROT( opname ) \ +\ +dim_t PASTEMAC0(opname) \ + ( \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) +GENPROT( herk_determine_kc_f ) +GENPROT( herk_determine_kc_b ) + GENPROT( trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 48249a9b3..e901f2766 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -226,29 +226,6 @@ void bli_syr2k_check 
bli_check_error_code( e_val ); } -#if 0 -void bli_trmm_check - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx - ) -{ - err_t e_val; - - // Perform checks common to hemm/symm. - - bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx ); - - // Check object structure. - - e_val = bli_check_triangular_object( a ); - bli_check_error_code( e_val ); -} -#endif - void bli_trmm_check ( side_t side, diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c new file mode 100644 index 000000000..a8dfee1ba --- /dev/null +++ b/frame/3/bli_l3_cntl.c @@ -0,0 +1,114 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +void bli_l3_cntl_create_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t** cntl_use + ) +{ + // If the control tree pointer is NULL, we construct a default + // tree as a function of the operation family. + if ( cntl_orig == NULL ) + { + opid_t family = bli_cntx_get_family( cntx ); + + if ( family == BLIS_GEMM || + family == BLIS_HERK || + family == BLIS_TRMM ) + { + *cntl_use = bli_gemm_cntl_create( family ); + } + else // if ( family == BLIS_TRSM ) + { + side_t side; + + if ( bli_obj_is_triangular( *a ) ) side = BLIS_LEFT; + else side = BLIS_RIGHT; + + *cntl_use = bli_trsm_cntl_create( side ); + } + } + else + { + // If the user provided a control tree, create a copy and use it + // instead (so that it can be used to cache things like pack mem_t + // entries). + *cntl_use = bli_cntl_copy( cntl_orig ); + } +} + +void bli_l3_cntl_free_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t* cntl_use, + thrinfo_t* thread + ) +{ + // If the control tree pointer is NULL, a default tree would have + // been created, so we now must free it. 
+ if ( cntl_orig == NULL ) + { + opid_t family = bli_cntx_get_family( cntx ); + + if ( family == BLIS_GEMM || + family == BLIS_HERK || + family == BLIS_TRMM ) + { + bli_gemm_cntl_free( cntl_use, thread ); + } + else // if ( family == BLIS_TRSM ) + { + bli_trsm_cntl_free( cntl_use, thread ); + } + } + else + { + // If the user provided a control tree, free the copy of it that + // was created. + bli_cntl_free( cntl_use, thread ); + } +} + diff --git a/frame/3/herk/old/bli_herk_thread.h b/frame/3/bli_l3_cntl.h similarity index 79% rename from frame/3/herk/old/bli_herk_thread.h rename to frame/3/bli_l3_cntl.h index 1feafd113..dc0aeb869 100644 --- a/frame/3/herk/old/bli_herk_thread.h +++ b/frame/3/bli_l3_cntl.h @@ -32,13 +32,29 @@ */ -#define bli_thrinfo_sub_self( thread ) thread->sub_l3op -#define bli_thrinfo_sub_opackm( thread ) thread->opackm -#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm -// For use in herk micro-kernel -#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) +// +// Prototype conditional control tree creation functions. +// -//thrinfo_t** bli_herk_thrinfo_create_paths( void ); +void bli_l3_cntl_create_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t** cntl_use + ); + +void bli_l3_cntl_free_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t* cntl_use, + thrinfo_t* thread + ); diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c new file mode 100644 index 000000000..993501541 --- /dev/null +++ b/frame/3/bli_l3_direct.c @@ -0,0 +1,140 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +dir_t bli_l3_direct + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx + ) +{ + // Query the operation family. 
+ opid_t family = bli_cntx_family( cntx ); + + if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); + else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c ); + else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c ); + else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c ); + + // This should never execute. + return BLIS_FWD; +} + +// ----------------------------------------------------------------------------- + +dir_t bli_gemm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + // For gemm, movement may be forwards (or backwards). + + return BLIS_FWD; +} + +dir_t bli_herk_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + // For herk, movement may be forwards (or backwards). + + return BLIS_FWD; +} + +dir_t bli_trmm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; + + // For trmm, movement for the parameter cases is as follows: + // - left,lower: backwards + // - left,upper: forwards + // - right,lower: forwards + // - right,upper: backwards + + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_BWD; + else direct = BLIS_FWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + + return direct; +} + +dir_t bli_trsm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; + + // For trsm, movement for the parameter cases is as follows: + // - left,lower: forwards + // - left,upper: backwards + // - right,lower: backwards + // - right,upper: forwards + + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_BWD; + else direct = BLIS_FWD; + } + + return direct; +} + diff --git a/frame/3/trsm/old/bli_trsm_thread.h b/frame/3/bli_l3_direct.h similarity index 79% rename from 
frame/3/trsm/old/bli_trsm_thread.h rename to frame/3/bli_l3_direct.h index 985b6c4a6..7b88ba51f 100644 --- a/frame/3/trsm/old/bli_trsm_thread.h +++ b/frame/3/bli_l3_direct.h @@ -32,11 +32,28 @@ */ -#define bli_thrinfo_sub_self( thread ) thread->sub_l3op -#define bli_thrinfo_sub_opackm( thread ) thread->opackm -#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm +dir_t bli_l3_direct + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx + ); -#define trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +// ----------------------------------------------------------------------------- -//thrinfo_t** bli_trsm_thrinfo_create_paths( bool_t right_sided ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +dir_t PASTEMAC0(opname) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ); + +GENPROT( gemm_direct ) +GENPROT( herk_direct ) +GENPROT( trmm_direct ) +GENPROT( trsm_direct ) diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c new file mode 100644 index 000000000..28fb1f857 --- /dev/null +++ b/frame/3/bli_l3_packm.c @@ -0,0 +1,179 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_l3_packm + ( + obj_t* x, + obj_t* x_pack, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + membrk_t* membrk; + packbuf_t pack_buf_type; + mem_t* cntl_mem_p; + siz_t size_needed; + + // FGVZ: Not sure why we need this barrier, but we do. + bli_thread_obarrier( thread ); + + // Every thread initializes x_pack and determines the size of memory + // block needed (which gets embedded into the otherwise "blank" mem_t + // entry in the control tree node). + size_needed + = + bli_packm_init + ( + x, + x_pack, + cntx, + cntl + ); + + // If zero was returned, no memory needs to be allocated and so we can + // return early. + if ( size_needed == 0 ) return; + + // Query the memory broker from the context. + membrk = bli_cntx_get_membrk( cntx ); + + // Query the pack buffer type from the control tree node. + pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + + // Query the address of the mem_t entry within the control tree node. + cntl_mem_p = bli_cntl_pack_mem( cntl ); + + // Check the mem_t field in the control tree. 
If it is unallocated, then + // we need to acquire a block from the memory broker and broadcast it to + // all threads in the chief's thread group. + if ( bli_mem_is_unalloc( cntl_mem_p ) ) + { + mem_t* local_mem_p; + mem_t local_mem_s; + + if ( bli_thread_am_ochief( thread ) ) + { + // The chief thread acquires a block from the memory broker + // and saves the associated mem_t entry to local_mem_s. + bli_membrk_acquire_m + ( + membrk, + size_needed, + pack_buf_type, + &local_mem_s + ); + } + + // Broadcast the address of the chief thread's local mem_t entry to + // all threads. + local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + + // Save the contents of the chief thread's local mem_t entry to the + // mem_t field in this thread's control tree node. + *cntl_mem_p = *local_mem_p; + } + else // ( bli_mem_is_alloc( cntl_mem_p ) ) + { + mem_t* local_mem_p; + mem_t local_mem_s; + + // If the mem_t entry in the control tree does NOT contain a NULL + // buffer, then a block has already been acquired from the memory + // broker and cached in the control tree. + + // BUT, we need to make sure that the mem_t object is not associated + // with a block that is too small given the size of the packed matrix + // that we need, according to the return value from packm_init(). + siz_t cntl_mem_size = bli_mem_size( cntl_mem_p ); + + if ( cntl_mem_size < size_needed ) + { + if ( bli_thread_am_ochief( thread ) ) + { + // The chief thread releases the existing block associated with + // the mem_t entry in the control tree, and then re-acquires a + // new block, saving the associated mem_t entry to local_mem_s. + bli_membrk_release( cntl_mem_p ); + bli_membrk_acquire_m + ( + membrk, + size_needed, + pack_buf_type, + &local_mem_s + ); + } + + // Broadcast the address of the chief thread's local mem_t entry to + // all threads.
+ local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + + // Save the chief thread's local mem_t entry to the mem_t field in + // this thread's control tree node. + *cntl_mem_p = *local_mem_p; + } + else + { + // If the mem_t entry is already allocated and sufficiently large, + // then we use it as-is. No action is needed, because all threads + // will already have the cached values in their local control + // trees' mem_t entries, currently pointed to by cntl_mem_p. + + bli_thread_obarrier( thread ); + } + } + + + // Update the buffer address in x_pack to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). + bli_obj_set_buffer_to_mem( cntl_mem_p, *x_pack ); + + + // Pack the contents of object x to object x_pack. + bli_packm_int + ( + x, + x_pack, + cntx, + cntl, + thread + ); + + // Barrier so that packing is done before computation. + bli_thread_obarrier( thread ); +} + diff --git a/frame/3/trsm/old/bli_trsm_blk_var2f.h b/frame/3/bli_l3_packm.h similarity index 89% rename from frame/3/trsm/old/bli_trsm_blk_var2f.h rename to frame/3/bli_l3_packm.h index 8b5d2dd7e..7dc5dfb46 100644 --- a/frame/3/trsm/old/bli_trsm_blk_var2f.h +++ b/frame/3/bli_l3_packm.h @@ -32,9 +32,14 @@ */ -void bli_trsm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); +#include "blis.h" + +void bli_l3_packm + ( + obj_t* x, + obj_t* x_pack, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index a8c853c56..f908bbb64 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -34,6 +34,86 @@ #include "blis.h" +/* +void bli_l3_prune_unref_mparts_m + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx + ) +{ + // Query the operation family. + opid_t family = bli_cntx_family( cntx ); + + if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. 
+ else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c ); + else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c ); + else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c ); +} +*/ + +#undef GENFRONT +#define GENFRONT( dim ) \ +\ +void PASTEMAC(l3_prune_unref_mparts_,dim) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx \ + ) \ +{ \ + /* Query the operation family. */ \ + opid_t family = bli_cntx_family( cntx ); \ +\ + if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \ + else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \ + else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \ + else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \ +} + +GENFRONT( m ) +GENFRONT( n ) +GENFRONT( k ) + +// ----------------------------------------------------------------------------- + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_prune_unref_mparts_m) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ) \ +{ \ + /* No pruning is necessary for gemm. */ \ +} \ +void PASTEMAC(opname,_prune_unref_mparts_n) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ) \ +{ \ + /* No pruning is necessary for gemm. */ \ +} \ +void PASTEMAC(opname,_prune_unref_mparts_k) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ) \ +{ \ + /* No pruning is necessary for gemm. 
*/ \ +} + +GENFRONT( gemm ) + +// ----------------------------------------------------------------------------- + #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index b4870407d..13d661ff1 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -33,6 +33,23 @@ */ +#undef GENPROT +#define GENPROT( dim ) \ +\ +void PASTEMAC(l3_prune_unref_mparts_,dim) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx \ + ); + +GENPROT( m ) +GENPROT( n ) +GENPROT( k ) + +// ----------------------------------------------------------------------------- + #undef GENPROT #define GENPROT( opname, dim ) \ \ @@ -43,6 +60,10 @@ void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ obj_t* c \ ); +GENPROT( gemm, m ) +GENPROT( gemm, n ) +GENPROT( gemm, k ) + GENPROT( herk, m ) GENPROT( herk, n ) GENPROT( herk, k ) diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 0bea43e9d..36b65b52b 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -43,9 +43,7 @@ thrinfo_t* bli_l3_thrinfo_create dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ) { return bli_thrinfo_create @@ -54,9 +52,8 @@ thrinfo_t* bli_l3_thrinfo_create icomm, icomm_id, n_way, work_id, - opackm, - ipackm, - sub_self + TRUE, + sub_node ); } @@ -69,9 +66,7 @@ void bli_l3_thrinfo_init dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ) { bli_thrinfo_init @@ -81,9 +76,8 @@ void bli_l3_thrinfo_init icomm, icomm_id, n_way, work_id, - opackm, - ipackm, - sub_self + TRUE, + sub_node ); } @@ -101,25 +95,37 @@ void bli_l3_thrinfo_free ) { if ( thread == NULL || - thread == &BLIS_GEMM_SINGLE_THREADED || - thread == &BLIS_HERK_SINGLE_THREADED + thread == &BLIS_PACKM_SINGLE_THREADED || + thread == &BLIS_GEMM_SINGLE_THREADED ) return; - // Free Communicators - if ( 
bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( thread->ocomm ); - if ( bli_thrinfo_sub_self( thread ) == NULL && bli_thread_am_ichief( thread ) ) - bli_thrcomm_free( thread->icomm ); + thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); - // Free thrinfo chidren - bli_packm_thrinfo_free( thread->opackm ); - bli_packm_thrinfo_free( thread->ipackm ); - bli_l3_thrinfo_free( thread->sub_self ); + // Free the communicators, but only if the current thrinfo_t struct + // is marked as needing them to be freed. The most common example of + // thrinfo_t nodes NOT marked as needing their comms freed are those + // associated with packm thrinfo_t nodes. + if ( bli_thrinfo_needs_free_comms( thread ) ) + { + // The ochief always frees his communicator, and the ichief frees its + // communicator if we are at the leaf node. + if ( bli_thread_am_ochief( thread ) ) + bli_thrcomm_free( bli_thrinfo_ocomm( thread ) ); + if ( thrinfo_sub_node == NULL && bli_thread_am_ichief( thread ) ) + bli_thrcomm_free( bli_thrinfo_icomm( thread ) ); + } + + // Free all children of the current thrinfo_t. + bli_l3_thrinfo_free( thrinfo_sub_node ); + + // Free the thrinfo_t struct.
bli_free_intl( thread ); } // ----------------------------------------------------------------------------- +//#define PRINT_THRINFO + thrinfo_t** bli_l3_thrinfo_create_paths ( opid_t l3_op, @@ -207,6 +213,16 @@ thrinfo_t** bli_l3_thrinfo_create_paths dim_t jr_nt = ir_way; dim_t ir_nt = 1; +#ifdef PRINT_THRINFO +printf( " jc kc ic jr ir\n" ); +printf( "xx_way: %4lu %4lu %4lu %4lu %4lu\n", + jc_way, kc_way, ic_way, jr_way, ir_way ); +printf( "\n" ); +printf( " gl jc kc ic jr ir\n" ); +printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu\n", +global_num_threads, jc_nt, kc_nt, ic_nt, jr_nt, ir_nt ); +printf( "=======================================\n" ); +#endif thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); @@ -230,7 +246,8 @@ thrinfo_t** bli_l3_thrinfo_create_paths for( int e = 0; e < ir_way; e++ ) { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); + thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); + dim_t ir_comm_id = 0; dim_t jr_comm_id = e*ir_nt + ir_comm_id; dim_t ic_comm_id = d*jr_nt + jr_comm_id; @@ -238,84 +255,75 @@ thrinfo_t** bli_l3_thrinfo_create_paths dim_t jc_comm_id = b*kc_nt + kc_comm_id; dim_t global_comm_id = a*jc_nt + jc_comm_id; - // Macrokernel loops + // macro-kernel loops thrinfo_t* ir_info = bli_l3_thrinfo_create( jr_comm, jr_comm_id, ir_comm, ir_comm_id, ir_way, e, - NULL, NULL, NULL ); - + NULL ); thrinfo_t* jr_info = bli_l3_thrinfo_create( ic_comm, ic_comm_id, jr_comm, jr_comm_id, jr_way, d, - NULL, NULL, ir_info ); - //blk_var_1 + ir_info ); + // packa thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - thrinfo_t* pack_ic_out - = - bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - + ic_nt, ic_comm_id, + jr_info ); + // blk_var1 thrinfo_t* ic_info = bli_l3_thrinfo_create( kc_comm, kc_comm_id, ic_comm, ic_comm_id, ic_way, c, - pack_ic_out, pack_ic_in, jr_info ); - //blk_var_3 + pack_ic_in ); + // 
packb thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* pack_kc_out - = - bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - + kc_nt, kc_comm_id, + ic_info ); + // blk_var3 thrinfo_t* kc_info = bli_l3_thrinfo_create( jc_comm, jc_comm_id, kc_comm, kc_comm_id, kc_way, b, - pack_kc_out, pack_kc_in, ic_info ); - //blk_var_2 - thrinfo_t* pack_jc_in - = - bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* pack_jc_out - = - bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - + pack_kc_in ); + // blk_var2 thrinfo_t* jc_info = bli_l3_thrinfo_create( global_comm, global_comm_id, jc_comm, jc_comm_id, jc_way, a, - pack_jc_out, pack_jc_in, kc_info ); + kc_info ); paths[global_comm_id] = jc_info; + +#ifdef PRINT_THRINFO +printf( " gl jc kc ic jr ir\n" ); +printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu\n", +global_comm_id, jc_comm_id, kc_comm_id, ic_comm_id, jr_comm_id, ir_comm_id ); +//printf( " a b c d e\n" ); +printf( "work ids: %4ld %4ld %4ld %4ld %4ld\n", (long int)a, (long int)b, (long int)c, (long int)d, (long int)e ); +printf( "---------------------------------------\n" ); +#endif + } } } } } +#ifdef PRINT_THRINFO +exit(1); +#endif + return paths; } diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 887fc9900..7eac72298 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -69,9 +69,7 @@ thrinfo_t* bli_l3_thrinfo_create dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ); void bli_l3_thrinfo_init @@ -83,9 +81,7 @@ void bli_l3_thrinfo_init dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single diff 
--git a/frame/3/bli_l3_var_oft.h b/frame/3/bli_l3_var_oft.h new file mode 100644 index 000000000..ef48d5e85 --- /dev/null +++ b/frame/3/bli_l3_var_oft.h @@ -0,0 +1,77 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_L3_VAR_OFT_H +#define BLIS_L3_VAR_OFT_H + + +// +// -- Level-3 variant function types ------------------------------------------- +// + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + gemm_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( gemm ) + + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + trsm_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( trsm ) + + + +#endif + diff --git a/frame/3/bli_l3_voft.h b/frame/3/bli_l3_voft.h new file mode 100644 index 000000000..52210f172 --- /dev/null +++ b/frame/3/bli_l3_voft.h @@ -0,0 +1,76 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L3_VOFT_H +#define BLIS_L3_VOFT_H + + +// +// -- Level-3 variant function types ------------------------------------------- +// + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( gemm ) + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( trsm ) + + + +#endif + diff --git a/frame/cntl/bli_cntl_init.c b/frame/3/gemm/bli_gemm_blk_var1.c similarity index 56% rename from frame/cntl/bli_cntl_init.c rename to frame/3/gemm/bli_gemm_blk_var1.c index b7c53ec65..1a5693d8c 100644 --- a/frame/cntl/bli_cntl_init.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -34,71 +34,62 @@ #include "blis.h" -static bool_t bli_cntl_is_init = FALSE; - -void bli_cntl_init( void ) +void bli_gemm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { - // If the API is already initialized, return early. 
- if ( bli_cntl_is_initialized() ) return; + obj_t a1, c1; - // Level-1 - bli_scalv_cntl_init(); - bli_packv_cntl_init(); - bli_unpackv_cntl_init(); + dir_t direct; - // Level-1m - bli_scalm_cntl_init(); - bli_packm_cntl_init(); - bli_unpackm_cntl_init(); + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; - // Level-2 - bli_gemv_cntl_init(); - bli_ger_cntl_init(); - bli_hemv_cntl_init(); - bli_her_cntl_init(); - bli_her2_cntl_init(); - bli_trmv_cntl_init(); - bli_trsv_cntl_init(); + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); - // Level-3 - bli_gemm_cntl_init(); - bli_trsm_cntl_init(); + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_m( a, b, c, cntx ); - // Mark API as initialized. - bli_cntl_is_init = TRUE; -} - -void bli_cntl_finalize( void ) -{ - // Level-1 - bli_scalv_cntl_finalize(); - bli_packv_cntl_finalize(); - bli_unpackv_cntl_finalize(); - - // Level-1m - bli_scalm_cntl_finalize(); - bli_packm_cntl_finalize(); - bli_unpackm_cntl_finalize(); - - // Level-2 - bli_gemv_cntl_finalize(); - bli_ger_cntl_finalize(); - bli_hemv_cntl_finalize(); - bli_her_cntl_finalize(); - bli_her2_cntl_finalize(); - bli_trmv_cntl_finalize(); - bli_trsv_cntl_finalize(); - - // Level-3 - bli_gemm_cntl_finalize(); - bli_trsm_cntl_finalize(); - - // Mark API as uninitialized. - bli_cntl_is_init = FALSE; -} - -bool_t bli_cntl_is_initialized( void ) -{ - return bli_cntl_is_init; + // Determine the current thread's subpartition range. + bli_thread_get_range_mdim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); + + // Partition along the m dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and C1. 
+ bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform gemm subproblem. + bli_gemm_int + ( + &BLIS_ONE, + &a1, + b, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + } } diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c deleted file mode 100644 index ee4a6a763..000000000 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_gemm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - //The s is for "lives on the stack" - obj_t b_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack = NULL; - obj_t* b_pack = NULL; - obj_t* c1_pack = NULL; - - dim_t i; - dim_t b_alg; - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing B. - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); - - // Initialize objects passed into bli_packm_init for A and C - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack B (if instructed). - bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_t2b( thread, a, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the m dimension. 
- for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: Use of a (for execution datatype) is intentional! - // This causes the right blocksize to be used if c and a are - // complex and b is real. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem. - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c new file mode 100644 index 000000000..a65f8a20a --- /dev/null +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_ndim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); + + // Partition along the n dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform gemm subproblem. 
+ bli_gemm_int + ( + &BLIS_ONE, + a, + &b1, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + } +} + diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c deleted file mode 100644 index f44951a20..000000000 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_gemm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t a_pack_s; - obj_t b1_pack_s, c1_pack_s; - - obj_t b1, c1; - obj_t* a_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c1_pack = NULL; - - dim_t i; - dim_t b_alg; - - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A - bli_obj_init_pack( &a_pack_s ); - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for B and C that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &b1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). - bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_l2r( thread, b, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the n dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: Use of b (for execution datatype) is intentional! - // This causes the right blocksize to be used if c and a are - // complex and b is real. - b_alg = bli_determine_blocksize_f( i, my_end, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for B1 and C1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and B1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem. - bli_gemm_int( &BLIS_ONE, - a_pack, - b1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c new file mode 100644 index 000000000..7be9c6a58 --- /dev/null +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + + // Query dimension in partitioning direction. + k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. 
+ b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Perform gemm subproblem. + bli_gemm_int + ( + &BLIS_ONE, + &a1, + &b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread) + ); + + bli_thread_ibarrier( thread ); + + // This variant executes multiple rank-k updates. Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c is a locally aliased obj_t (see _int() function), we + // can simply overwrite the internal beta scalar with BLIS_ONE once + // it has been used in the first iteration. However... + + // This variant is shared by the gemm, herk, and trmm families. For gemm + // and herk, the internal scalar on C is reset after the first iteration + // so that later iterations do not apply beta more than once. That reset + // must not be applied to trmm. That is because the order of + // computation is always such that the beta that is passed into the + // macro-kernel must be zero, since the macro-kernel only applies that + // beta to (and thus overwrites) the row-panel of C that corresponds to + // the current block intersecting the diagonal. It turns out that this + // same pattern holds for trmm3 as well--except there, the beta scalar + // is potentially non-zero, but is still applied only to the current + // row-panel of C, and thus beta is applied to all of C exactly once. + // Thus, for neither trmm nor trmm3 should we reset the scalar on C + // after the first iteration. 
+ if ( bli_cntx_get_family( cntx ) != BLIS_TRMM ) + if ( i == 0 ) bli_obj_scalar_reset( c ); + } +} + diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c deleted file mode 100644 index 073760900..000000000 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_gemm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t c_pack_s; - obj_t a1_pack_s, b1_pack_s; - - obj_t a1, b1; - obj_t* a1_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c_pack = NULL; - - dim_t i; - dim_t b_alg; - dim_t k_trans; - - if( bli_thread_am_ochief( thread ) ){ - // Initialize object for packing C - bli_obj_init_pack( &c_pack_s ); - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - // Initialize pack objects for A and B that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ){ - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &b1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - - // Pack C (if instructed). - bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // Query dimension in partitioning direction. - k_trans = bli_obj_width_after_trans( *a ); - - // Partition along the k dimension. - for ( i = 0; i < k_trans; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: We call a gemm/hemm/symm-specific function to determine - // the kc blocksize so that we can implement the "nudging" of kc - // to be a multiple of mr or nr, as needed. - b_alg = bli_gemm_determine_kc_f( i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and B1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - - // Initialize objects for packing A1 and B1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem. - bli_gemm_int( &BLIS_ONE, - a1_pack, - b1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. - bli_thread_ibarrier( thread ); - if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); - - } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- if( bli_thread_am_ochief( thread ) ) - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - } -} - diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 382b82bbd..3f3773418 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -34,140 +34,101 @@ #include "blis.h" -extern scalm_t* scalm_cntl; - -packm_t* gemm_packa_cntl = NULL; -packm_t* gemm_packb_cntl = NULL; - -gemm_t* gemm_cntl_bp_ke = NULL; -gemm_t* gemm_cntl_op_bp = NULL; -gemm_t* gemm_cntl_mm_op = NULL; -gemm_t* gemm_cntl_vl_mm = NULL; - -gemm_t* gemm_cntl = NULL; - -void bli_gemm_cntl_init() +cntl_t* bli_gemm_cntl_create + ( + opid_t family + ) { - // Create control tree objects for packm operations. - gemm_packa_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MR, - BLIS_KR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK ); - - gemm_packb_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_KR, - BLIS_NR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL ); + void* macro_kernel_p = bli_gemm_ker_var2; - // - // Create a control tree for packing A and B, and streaming C. - // + // Change the macro-kernel if the operation family is herk or trmm. + if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; - // Create control tree object for lowest-level block-panel kernel. 
- gemm_cntl_bp_ke - = - bli_gemm_cntl_obj_create( BLIS_UNB_OPT, - BLIS_VARIANT2, - 0, // bszid_t not used by macro-kernel - NULL, NULL, NULL, - NULL, NULL, NULL ); + // Create a node for the macro-kernel. + cntl_t* gemm_cntl_bp_ke = bli_gemm_cntl_obj_create + ( + BLIS_NR, // bszid not used by macro-kernel. + macro_kernel_p, + NULL // no sub-node; this is the leaf of the tree. + ); - // Create control tree object for outer panel (to block-panel) - // problem. - gemm_cntl_op_bp - = - bli_gemm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MC, - NULL, - gemm_packa_cntl, - gemm_packb_cntl, - NULL, - gemm_cntl_bp_ke, - NULL ); + // Create a node for packing matrix A. + cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_gemm_packa, + bli_packm_blk_var1, + BLIS_MR, + BLIS_KR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + gemm_cntl_bp_ke + ); - // Create control tree object for general problem via multiple - // rank-k (outer panel) updates. - gemm_cntl_mm_op - = - bli_gemm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, - BLIS_KC, - NULL, - NULL, - NULL, - NULL, - gemm_cntl_op_bp, - NULL ); + // Create a node for partitioning the m dimension by MC. + cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_obj_create + ( + BLIS_MC, + bli_gemm_blk_var1, + gemm_cntl_packa + ); - // Create control tree object for very large problem via multiple - // general problems. - gemm_cntl_vl_mm - = - bli_gemm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, - BLIS_NC, - NULL, - NULL, - NULL, - NULL, - gemm_cntl_mm_op, - NULL ); + // Create a node for packing matrix B. + cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_gemm_packb, + bli_packm_blk_var1, + BLIS_KR, + BLIS_NR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? 
+ BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + gemm_cntl_op_bp + ); - // Alias the "master" gemm control tree to a shorter name. - gemm_cntl = gemm_cntl_vl_mm; + // Create a node for partitioning the k dimension by KC. + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + ( + BLIS_KC, + bli_gemm_blk_var3, + gemm_cntl_packb + ); + + // Create a node for partitioning the n dimension by NC. + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + ( + BLIS_NC, + bli_gemm_blk_var2, + gemm_cntl_mm_op + ); + + return gemm_cntl_vl_mm; } -void bli_gemm_cntl_finalize() +void bli_gemm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ) { - bli_cntl_obj_free( gemm_packa_cntl ); - bli_cntl_obj_free( gemm_packb_cntl ); - - bli_cntl_obj_free( gemm_cntl_bp_ke ); - bli_cntl_obj_free( gemm_cntl_op_bp ); - bli_cntl_obj_free( gemm_cntl_mm_op ); - bli_cntl_obj_free( gemm_cntl_vl_mm ); + bli_cntl_free( cntl, thread ); } -gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_packm_a, - packm_t* sub_packm_b, - packm_t* sub_packm_c, - gemm_t* sub_gemm, - unpackm_t* sub_unpackm_c ) +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ) { - gemm_t* cntl; - - cntl = ( gemm_t* ) bli_malloc_intl( sizeof(gemm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bszid = bszid; - cntl->sub_scalm = sub_scalm; - cntl->sub_packm_a = sub_packm_a; - cntl->sub_packm_b = sub_packm_b; - cntl->sub_packm_c = sub_packm_c; - cntl->sub_gemm = sub_gemm; - cntl->sub_unpackm_c = sub_unpackm_c; - - return cntl; + return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 507a1dd14..5b985327c 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -32,31 +32,23 @@ */ -struct gemm_s -{ 
- impl_t impl_type; - varnum_t var_num; - bszid_t bszid; - struct scalm_s* sub_scalm; - struct packm_s* sub_packm_a; - struct packm_s* sub_packm_b; - struct packm_s* sub_packm_c; - struct gemm_s* sub_gemm; - struct unpackm_s* sub_unpackm_c; -}; -typedef struct gemm_s gemm_t; +cntl_t* bli_gemm_cntl_create + ( + opid_t family + ); -#define bli_cntl_sub_gemm( cntl ) cntl->sub_gemm +void bli_gemm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ); -void bli_gemm_cntl_init( void ); -void bli_gemm_cntl_finalize( void ); -gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_pack_a, - packm_t* sub_pack_b, - packm_t* sub_pack_c, - gemm_t* sub_gemm, - unpackm_t* sub_unpack_c ); +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ); diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 47b5573c4..0782d7272 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_gemm_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_gemm_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t b_local; @@ -59,7 +62,7 @@ void bli_gemm_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -70,7 +73,7 @@ void bli_gemm_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( a_local, b_local ); @@ -79,22 +82,27 @@ void bli_gemm_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_GEMM, cntx ); + + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index 0176eef37..9f11f61d4 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -32,11 +32,13 @@ */ -void bli_gemm_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_gemm_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 171f2d6f1..18e531879 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -34,41 +34,22 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[6][3] = -{ - // unblocked optimized unblocked blocked - { NULL, NULL, 
bli_gemm_blk_var1f }, - { NULL, bli_gemm_ker_var2, bli_gemm_blk_var2f }, - { NULL, NULL, bli_gemm_blk_var3f }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL }, - { NULL, NULL, NULL } -}; - -void bli_gemm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t b_local; obj_t c_local; - varnum_t n; - impl_t i; - FUNCPTR_T f; + gemm_voft f; ind_t im; // Check parameters. @@ -82,7 +63,7 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - if( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; @@ -93,32 +74,20 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { - if( bli_thread_am_ochief( thread ) ) + // This should never execute. + bli_abort(); + + if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } - // Alias A and B in case we need to update attached scalars. + // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); - - // Alias C in case we need to induce a transposition. bli_obj_alias_to( *c, c_local ); - // If we are about to call a leaf-level implementation, and matrix C - // still needs a transposition, then we must induce one by swapping the - // strides and dimensions. Note that this transposition would normally - // be handled explicitly in the packing of C, but if C is not being - // packed, this is our last chance to handle the transposition. 
- if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) - { - //if( bli_thread_am_ochief( thread ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); - // } - } - // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) @@ -133,29 +102,28 @@ void bli_gemm_int( obj_t* alpha, bli_obj_scalar_apply_scalar( beta, &c_local ); } - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. - f = vars[n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_var_func( cntl ); // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations. im = bli_cntx_get_ind_method( cntx ); if ( im != BLIS_NAT ) { - if ( im == BLIS_3M3 && f == bli_gemm_blk_var1f ) f = bli_gemm_blk_var4f; - else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var4; - else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var3; + if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa; + else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2; + else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; } // Invoke the variant. 
- f( &a_local, - &b_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &b_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/gemm/bli_gemm_int.h index 9177122fd..e8580cf95 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/gemm/bli_gemm_int.h @@ -32,12 +32,15 @@ */ -void bli_gemm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); +void bli_gemm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 036876cb6..b44564387 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -56,12 +56,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); -void bli_gemm_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -236,7 +239,7 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, aux ); \ bli_auxinfo_set_is_b( is_b, aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ diff --git a/frame/3/gemm/bli_gemm_packab.c b/frame/3/gemm/bli_gemm_packab.c new file mode 100644 index 000000000..c0166c828 --- /dev/null +++ b/frame/3/gemm/bli_gemm_packab.c @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a_pack; + + // Pack matrix A according to the control tree node. + bli_l3_packm + ( + a, + &a_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. 
+ bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} + +// ----------------------------------------------------------------------------- + +void bli_gemm_packb + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b_pack; + + // Pack matrix B according to the control tree node. + bli_l3_packm + ( + b, + &b_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix B. + bli_gemm_int + ( + &BLIS_ONE, + a, + &b_pack, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} + diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 0f7ecdb11..c66587fda 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -46,20 +46,22 @@ void PASTEMAC0(opname) \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ - gemm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( gemm_blk_var1f ) -GENPROT( gemm_blk_var2f ) -GENPROT( gemm_blk_var3f ) +GENPROT( gemm_blk_var1 ) +GENPROT( gemm_blk_var2 ) +GENPROT( gemm_blk_var3 ) +GENPROT( gemm_packa ) +GENPROT( gemm_packb ) GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: -GENPROT( gemm_blk_var4f ) // 3m3 -GENPROT( gemm_ker_var3 ) // 4m1b -GENPROT( gemm_ker_var4 ) // 3m2 +GENPROT( gemm3m3_packa ) // 3m3 +GENPROT( gemm4mb_ker_var2 ) // 4m1b +GENPROT( gemm3m2_ker_var2 ) // 3m2 // @@ -90,6 +92,6 @@ void PASTEMAC(ch,varname) \ INSERT_GENTPROT_BASIC( gemm_ker_var2 ) // Headers for induced algorithms: -INSERT_GENTPROT_BASIC( gemm_ker_var3 ) // 4m1b -INSERT_GENTPROT_BASIC( gemm_ker_var4 ) // 3m2 +INSERT_GENTPROT_BASIC( gemm4mb_ker_var2 ) // 4m1b +INSERT_GENTPROT_BASIC( gemm3m2_ker_var2 ) // 3m2 diff --git a/frame/3/gemm/ind/bli_gemm_ker_var4.c b/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c similarity index 95% rename from frame/3/gemm/ind/bli_gemm_ker_var4.c rename to 
frame/3/gemm/ind/bli_gemm3m2_ker_var2.c index 3d5cd1859..ea8904183 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var4.c +++ b/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c @@ -53,15 +53,18 @@ typedef void (*FUNCPTR_T)( thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var4); +static FUNCPTR_T GENARRAY(ftypes,gemm3m2_ker_var2); -void bli_gemm_ker_var4( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm3m2_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -238,7 +241,7 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, aux ); \ bli_auxinfo_set_is_b( is_b, aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ @@ -351,9 +354,9 @@ void PASTEMAC(ch,varname) \ } \ } \ \ -/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var4: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var4: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "gemm3m2_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm3m2_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ } -INSERT_GENTFUNC_BASIC0( gemm_ker_var4 ) +INSERT_GENTFUNC_BASIC0( gemm3m2_ker_var2 ) diff --git a/frame/3/gemm/ind/bli_gemm3m3_packa.c b/frame/3/gemm/ind/bli_gemm3m3_packa.c new file mode 100644 index 000000000..516047213 --- /dev/null +++ b/frame/3/gemm/ind/bli_gemm3m3_packa.c @@ -0,0 +1,142 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm3m3_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a_pack; + + // Make a copy of the context for each stage. + cntx_t cntx_ro = *cntx; + cntx_t cntx_io = *cntx; + cntx_t cntx_rpi = *cntx; + + // ----------------------------------------------------- + + // Initialize the context for the real-only stage. 
+ bli_gemm3m3_cntx_stage( 0, &cntx_ro ); + + // Pack the real-only part of matrix A. + bli_l3_packm + ( + a, + &a_pack, + &cntx_ro, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. + bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + // Only apply beta within the first of three subproblems. + bli_obj_scalar_reset( c ); + + // ----------------------------------------------------- + + // Initialize the context for the imag-only stage. + bli_gemm3m3_cntx_stage( 1, &cntx_io ); + + // Pack the imag-only part of matrix A. + bli_l3_packm + ( + a, + &a_pack, + &cntx_io, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. + bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + // ----------------------------------------------------- + + // Initialize the context for the real+imag stage. + bli_gemm3m3_cntx_stage( 2, &cntx_rpi ); + + // Pack the real+imag part of matrix A. + bli_l3_packm + ( + a, + &a_pack, + &cntx_rpi, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. 
+ bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + +} + diff --git a/frame/3/gemm/ind/bli_gemm_ker_var3.c b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c similarity index 96% rename from frame/3/gemm/ind/bli_gemm_ker_var3.c rename to frame/3/gemm/ind/bli_gemm4mb_ker_var2.c index 11c684810..d9d714917 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var3.c +++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c @@ -53,15 +53,18 @@ typedef void (*FUNCPTR_T)( thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var3); +static FUNCPTR_T GENARRAY(ftypes,gemm4mb_ker_var2); -void bli_gemm_ker_var3( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm4mb_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -238,7 +241,7 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, aux ); \ bli_auxinfo_set_is_b( is_b, aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ @@ -349,5 +352,5 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var3: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ } -INSERT_GENTFUNC_BASIC0( gemm_ker_var3 ) +INSERT_GENTFUNC_BASIC0( gemm4mb_ker_var2 ) diff --git a/frame/3/gemm/ind/bli_gemm_blk_var4f.c b/frame/3/gemm/ind/bli_gemm_blk_var4f.c deleted file mode 100644 index 9308014d0..000000000 --- a/frame/3/gemm/ind/bli_gemm_blk_var4f.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_gemm_blk_var4f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - //The s is for "lives on the stack" - obj_t b_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack = NULL; - obj_t* b_pack = NULL; - obj_t* c1_pack = NULL; - - dim_t i; - dim_t b_alg; - - // Make a copy of the context for each stage. 
- cntx_t cntx_ro = *cntx; - cntx_t cntx_io = *cntx; - cntx_t cntx_rpi = *cntx; - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing B. - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by - // chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); - - // Initialize objects passed into bli_packm_init for A and C - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack B (if instructed). - bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_t2b( thread, a, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the m dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: Use of a (for execution datatype) is intentional! - // This causes the right blocksize to be used if c and a are - // complex and b is real. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - - // Initialize the context for the real-only stage. - bli_gemm3m3_cntx_stage( 0, &cntx_ro ); - - // Initialize objects for packing A1 and C1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - &cntx_ro, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - &cntx_ro, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - &cntx_ro, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - &cntx_ro, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem (real-only). - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - - // Only apply beta within the first of three subproblems. - if ( bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c1_pack ); - - - // Initialize the context for the imag-only stage. - bli_gemm3m3_cntx_stage( 1, &cntx_io ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - &cntx_io, bli_cntl_sub_packm_a( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - &cntx_io, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem (imag-only). - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - - // Initialize the context for the real+imag stage. - bli_gemm3m3_cntx_stage( 2, &cntx_rpi ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - &cntx_rpi, bli_cntl_sub_packm_a( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). 
- bli_packm_int( &a1, a1_pack, - &cntx_rpi, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem (real+imag). - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - - // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - // It doesn't matter which packm cntl node we pass in, as long - // as it is valid, packm_release() will release the mem_t entry - // stored in a1_pack. - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/gemm/ind/bli_gemm_blk_var4f.h b/frame/3/gemm/ind/bli_gemm_blk_var4f.h deleted file mode 100644 index 289e76550..000000000 --- a/frame/3/gemm/ind/bli_gemm_blk_var4f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_gemm_blk_var4f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/gemm/ind/bli_gemm_ker_var3.h b/frame/3/gemm/ind/bli_gemm_ker_var3.h deleted file mode 100644 index 042120185..000000000 --- a/frame/3/gemm/ind/bli_gemm_ker_var3.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_gemm_ker_var3( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( gemm_ker_var3 ) - diff --git a/frame/3/gemm/ind/bli_gemm_ker_var4.h b/frame/3/gemm/ind/bli_gemm_ker_var4.h deleted file mode 100644 index 95268de2a..000000000 --- a/frame/3/gemm/ind/bli_gemm_ker_var4.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_gemm_ker_var4( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( gemm_ker_var4 ) - diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 21bda90da..ed7e03b9c 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -34,14 +34,17 @@ #include "blis.h" -void bli_hemm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_hemm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t b_local; @@ -60,7 +63,7 @@ void bli_hemm_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -71,7 +74,7 @@ void bli_hemm_front( side_t side, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_toggle_conj( a_local ); @@ -86,22 +89,27 @@ void bli_hemm_front( side_t side, bli_obj_swap( a_local, b_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_GEMM, cntx ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h index 840b24791..e1d40c80e 100644 --- a/frame/3/hemm/bli_hemm_front.h +++ b/frame/3/hemm/bli_hemm_front.h @@ -32,12 +32,14 @@ */ -void bli_hemm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_hemm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 263155de2..f72dedf87 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_her2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_her2k_front + 
( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t alpha_conj; obj_t c_local; @@ -64,7 +67,7 @@ void bli_her2k_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -91,7 +94,7 @@ void bli_her2k_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); @@ -104,49 +107,43 @@ void bli_her2k_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } -#if 0 - // Invoke the internal back-end. - bli_her2k_int( alpha, - &a_local, - &bh_local, - &alpha_conj, - &b_local, - &ah_local, - beta, - &c_local, - cntl ); -#else + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); // Invoke herk twice, using beta only the first time. - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); // Invoke the internal back-end. 
- bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &bh_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &bh_local, + beta, + &c_local, + cntx, + cntl, + infos + ); - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - &alpha_conj, - &b_local, - &ah_local, - &BLIS_ONE, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + &alpha_conj, + &b_local, + &ah_local, + &BLIS_ONE, + &c_local, + cntx, + cntl, + infos + ); - bli_l3_thrinfo_free_paths( infos, n_threads ); - -#endif + bli_l3_thrinfo_free_paths( infos, n_threads ); // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of @@ -155,6 +152,5 @@ void bli_her2k_front( obj_t* alpha, // non-zero values. To prevent this, we explicitly set those values // to zero before returning. 
bli_setid( &BLIS_ZERO, &c_local ); - } diff --git a/frame/3/her2k/bli_her2k_front.h b/frame/3/her2k/bli_her2k_front.h index 8a699c4c4..6f1246ea6 100644 --- a/frame/3/her2k/bli_her2k_front.h +++ b/frame/3/her2k/bli_her2k_front.h @@ -32,11 +32,13 @@ */ -void bli_her2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_her2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h index 290b8bda3..d9aebc78b 100644 --- a/frame/3/herk/bli_herk.h +++ b/frame/3/herk/bli_herk.h @@ -33,7 +33,6 @@ */ #include "bli_herk_front.h" -#include "bli_herk_int.h" #include "bli_herk_var.h" diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c deleted file mode 100644 index 95bc56f9c..000000000 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_blk_var1f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t ah_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack; - obj_t* c1_pack; - obj_t* ah_pack; - - dim_t i; - dim_t b_alg; - - // Prune any zero region that exists along the partitioning dimension. - bli_herk_prune_unref_mparts_m( a, ah, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A'. - bli_obj_init_pack( &ah_pack_s ); - bli_packm_init( ah, &ah_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - ah_pack = bli_thread_obroadcast( thread, &ah_pack_s ); - - // Initialize pack objects that are passed into packm_init() for A and C. - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A' (if instructed). 
- bli_packm_int( ah, ah_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_weighted_t2b( thread, c, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the m dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform herk subproblem. - bli_herk_int( &BLIS_ONE, - a1_pack, - ah_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( ah_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c deleted file mode 100644 index de7f6c972..000000000 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_blk_var2f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t a_pack_s; - obj_t ah1_pack_s, c1_pack_s; - - obj_t ah1, c1; - obj_t* a_pack; - obj_t* ah1_pack; - obj_t* c1_pack; - - dim_t i; - dim_t b_alg; - - // Prune any zero region that exists along the partitioning dimension. - bli_herk_prune_unref_mparts_n( a, ah, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A - bli_obj_init_pack( &a_pack_s ); - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for C and A' that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &ah1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). - bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_weighted_l2r( thread, c, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the n dimension. 
- for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1' and C1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, ah, &ah1 ); - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1' and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ) ; - - // Pack A1' (if instructed). - bli_packm_int( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ) ; - - // Perform herk subproblem. - bli_herk_int( &BLIS_ONE, - a_pack, - ah1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c deleted file mode 100644 index 7e82ba87f..000000000 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_blk_var3f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t c_pack_s; - obj_t a1_pack_s, ah1_pack_s; - - obj_t a1, ah1; - obj_t* a1_pack = NULL; - obj_t* ah1_pack = NULL; - obj_t* c_pack = NULL; - - dim_t i; - dim_t b_alg; - dim_t k_trans; - - // Prune any zero region that exists along the partitioning dimension. 
- bli_herk_prune_unref_mparts_k( a, ah, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing C. - bli_obj_init_pack( &c_pack_s ); - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - // Initialize all pack objects that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &ah1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s ); - - // Pack C (if instructed). - bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // Query dimension in partitioning direction. - k_trans = bli_obj_width_after_trans( *a ); - - // Partition along the k dimension. - for ( i = 0; i < k_trans; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, k_trans, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and A1'. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, ah, &ah1 ); - - // Initialize objects for packing A1 and A1'. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform herk subproblem. 
- bli_herk_int( &BLIS_ONE, - a1_pack, - ah1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. - bli_thread_ibarrier( thread ); - if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); - - } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - if( bli_thread_am_ochief( thread ) ) { - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - } - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) ); - } -} - diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index a4bd5ef0b..3abfa9baf 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_herk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_herk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t ah_local; @@ -60,7 +63,7 @@ void bli_herk_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and C in case we need to apply transformations. 
bli_obj_alias_to( *a, a_local ); @@ -76,7 +79,7 @@ void bli_herk_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_toggle_conj( a_local ); bli_obj_toggle_conj( ah_local ); @@ -84,22 +87,28 @@ void bli_herk_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &ah_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &ah_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + + bli_l3_thrinfo_free_paths( infos, n_threads ); // The Hermitian rank-k product was computed as A*A', even for the // diagonal elements. Mathematically, the imaginary components of @@ -108,6 +117,5 @@ void bli_herk_front( obj_t* alpha, // non-zero values. To prevent this, we explicitly set those values // to zero before returning. 
bli_setid( &BLIS_ZERO, &c_local ); - } diff --git a/frame/3/herk/bli_herk_front.h b/frame/3/herk/bli_herk_front.h index c778399d0..ef9325969 100644 --- a/frame/3/herk/bli_herk_front.h +++ b/frame/3/herk/bli_herk_front.h @@ -32,10 +32,12 @@ */ -void bli_herk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_herk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 94d6f6a77..c36b6b826 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -57,12 +57,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); -void bli_herk_l_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_l_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -270,7 +273,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index cc137d989..56da59f1a 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -57,12 +57,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); -void bli_herk_u_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_u_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ 
-270,7 +273,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/herk/bli_herk_var.h index 03d9b9ff5..a18c9ab49 100644 --- a/frame/3/herk/bli_herk_var.h +++ b/frame/3/herk/bli_herk_var.h @@ -46,16 +46,19 @@ void PASTEMAC0(opname) \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ - gemm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( herk_blk_var1f ) -GENPROT( herk_blk_var2f ) -GENPROT( herk_blk_var3f ) +//GENPROT( herk_blk_var1 ) +//GENPROT( herk_blk_var2 ) +//GENPROT( herk_blk_var3 ) +GENPROT( herk_x_ker_var2 ) GENPROT( herk_l_ker_var2 ) GENPROT( herk_u_ker_var2 ) +//GENPROT( herk_packa ) +//GENPROT( herk_packb ) // diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/herk/bli_herk_x_ker_var2.c new file mode 100644 index 000000000..71a4cc59b --- /dev/null +++ b/frame/3/herk/bli_herk_x_ker_var2.c @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// Macrokernel table consulted by bli_herk_x_ker_var2(): slot 0 is selected
+// when bli_obj_root_is_lower() holds for C, slot 1 in every other case.
+static gemm_voft vars[2] =
+{
+	bli_herk_l_ker_var2, bli_herk_u_ker_var2,
+};
+
+// Dispatch a herk macrokernel call based on the uplo field of C's root
+// object; all six arguments are forwarded to the chosen kernel unchanged.
+void bli_herk_x_ker_var2
+     (
+       obj_t*  a,
+       obj_t*  ah,
+       obj_t*  c,
+       cntx_t* cntx,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	// Table slot for this call: 0 if the root of C is lower, otherwise 1.
+	const bool_t slot = ( bli_obj_root_is_lower( *c ) ? 0 : 1 );
+
+	// Fetch the matching macrokernel from the table.
+	const gemm_voft kernel = vars[slot];
+
+	// Run the selected macrokernel on the caller's operands.
+	kernel
+	(
+	  a,
+	  ah,
+	  c,
+	  cntx,
+	  cntl,
+	  thread
+	);
+}
+
diff --git a/frame/3/herk/old/bli_herk_blk_var1.c b/frame/3/herk/old/bli_herk_blk_var1.c
new file mode 100644
index 000000000..59a20e878
--- /dev/null
+++ b/frame/3/herk/old/bli_herk_blk_var1.c
@@ -0,0 +1,98 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+// Blocked herk variant 1: steps through this thread's range of the m dimension,
+// applying the herk sub-problem to each (A1, C1) partition of A and C.
+void bli_herk_blk_var1
+     (
+       obj_t*  a,         // partitioned along m into A1 blocks
+       obj_t*  b,         // passed whole to every sub-problem
+       obj_t*  c,         // partitioned along m into C1 blocks
+       cntx_t* cntx,      // queried for blocksizes; forwarded to the sub-problem
+       cntl_t* cntl,      // supplies the blocksize id and the sub-node
+       thrinfo_t* thread  // determines this thread's range; used for the barrier
+     )
+{
+	obj_t a1, c1;
+	dir_t direct;
+	dim_t i;
+	dim_t b_alg;
+	dim_t my_start, my_end;
+
+	// Determine the direction in which to partition (forwards or backwards).
+	direct = bli_herk_direct( a, b, c );
+
+	// Prune any zero region that exists along the partitioning dimension.
+	bli_herk_prune_unref_mparts_m( a, b, c );
+
+	// Determine the current thread's subpartition range.
+	bli_thread_get_range_weighted_mdim
+	(
+	  direct, thread, a,
+	  bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
+	  &my_start, &my_end
+	);
+
+	// Partition along the m dimension.
+	for ( i = my_start; i < my_end; i += b_alg )
+	{
+		// Determine the current algorithmic blocksize.
+		b_alg = bli_determine_blocksize( direct, i, my_end, a,
+		                                 bli_cntl_bszid( cntl ), cntx );
+
+		// Acquire partitions for A1 and C1.
+		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+		                        i, b_alg, a, &a1 );
+		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+		                        i, b_alg, c, &c1 );
+
+		// Perform herk subproblem (A1 and C1 are local obj_t's, so pass their addresses).
+		bli_herk_int
+		(
+		  &BLIS_ONE,
+		  &a1,
+		  b,
+		  &BLIS_ONE,
+		  &c1,
+		  cntx,
+		  bli_cntl_sub_node( cntl ),
+		  bli_thrinfo_sub_self( thread )
+		);
+
+		bli_thread_ibarrier( thread );
+	}
+}
+
diff --git a/frame/3/herk/old/bli_herk_blk_var2.c b/frame/3/herk/old/bli_herk_blk_var2.c
new file mode 100644
index 000000000..739ae0341
--- /dev/null
+++ b/frame/3/herk/old/bli_herk_blk_var2.c
@@ -0,0 +1,98 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+// Blocked herk variant 2: walks B and C along the n dimension and calls bli_herk_int() on each (a, B1, C1).
+void bli_herk_blk_var2
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	obj_t b1, c1;
+
+	dir_t direct;
+
+	dim_t i;
+	dim_t b_alg;
+	dim_t my_start, my_end;
+
+	// Determine the direction in which to partition (forwards or backwards).
+	direct = bli_herk_direct( a, b, c );
+
+	// Prune any zero region that exists along the partitioning dimension.
+	bli_herk_prune_unref_mparts_n( a, b, c ); // second operand is the parameter b; no 'ah' exists in this function
+
+	// Determine the current thread's subpartition range.
+	bli_thread_get_range_weighted_ndim
+	(
+	  direct, thread, b,
+	  bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
+	  &my_start, &my_end
+	);
+
+	// Partition along the n dimension.
+	for ( i = my_start; i < my_end; i += b_alg )
+	{
+		// Determine the current algorithmic blocksize.
+		b_alg = bli_determine_blocksize( direct, i, my_end, b,
+		                                 bli_cntl_bszid( cntl ), cntx );
+
+		// Acquire partitions for B1 and C1.
+		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
+		                        i, b_alg, b, &b1 );
+		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
+		                        i, b_alg, c, &c1 );
+
+		// Perform herk subproblem (alpha and beta are both BLIS_ONE here).
+		bli_herk_int
+		(
+		  &BLIS_ONE,
+		  a,
+		  &b1, // b1 is a local obj_t; bli_herk_int() takes obj_t*
+		  &BLIS_ONE,
+		  &c1, // c1 is a local obj_t; bli_herk_int() takes obj_t*
+		  cntx,
+		  bli_cntl_sub_node( cntl ),
+		  bli_thrinfo_sub_self( thread )
+		);
+
+		bli_thread_ibarrier( thread );
+	}
+}
+
diff --git a/frame/3/herk/old/bli_herk_blk_var2f.h b/frame/3/herk/old/bli_herk_blk_var2f.h
deleted file mode 100644
index f436a0082..000000000
--- a/frame/3/herk/old/bli_herk_blk_var2f.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas at Austin nor the names
-      of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_herk_blk_var2f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); - diff --git a/frame/3/herk/old/bli_herk_blk_var3.c b/frame/3/herk/old/bli_herk_blk_var3.c new file mode 100644 index 000000000..949ab53da --- /dev/null +++ b/frame/3/herk/old/bli_herk_blk_var3.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+// Blocked herk variant 3: walks A and B along the k dimension and calls bli_herk_int() on each (A1, B1, c).
+void bli_herk_blk_var3
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	obj_t a1, b1;
+
+	dir_t direct;
+
+	dim_t i;
+	dim_t b_alg;
+	dim_t k_trans;
+
+	// Determine the direction in which to partition (forwards or backwards).
+	direct = bli_herk_direct( a, b, c );
+
+	// Prune any zero region that exists along the partitioning dimension.
+	bli_herk_prune_unref_mparts_k( a, b, c );
+
+	// Query dimension in partitioning direction.
+	k_trans = bli_obj_width_after_trans( *a );
+
+	// Partition along the k dimension.
+	for ( i = 0; i < k_trans; i += b_alg )
+	{
+		// Determine the current algorithmic blocksize.
+		// Notice that, unlike with gemm/hemm/symm/trmm/trsm, we do not need
+		// to call a kc-specific routine. We do not need kc to be a multiple
+		// of MR or NR since neither A nor B has structure in herk.
+		b_alg = bli_determine_blocksize( direct, i, k_trans, a,
+		                                 bli_cntl_bszid( cntl ), cntx );
+
+		// Acquire partitions for A1 and B1.
+		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
+		                        i, b_alg, a, &a1 );
+		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+		                        i, b_alg, b, &b1 );
+
+		// Perform herk subproblem (alpha and beta are both BLIS_ONE here).
+		bli_herk_int
+		(
+		  &BLIS_ONE,
+		  &a1, // a1 is a local obj_t; bli_herk_int() takes obj_t*
+		  &b1, // b1 is a local obj_t; bli_herk_int() takes obj_t*
+		  &BLIS_ONE,
+		  c,
+		  cntx,
+		  bli_cntl_sub_node( cntl ),
+		  bli_thrinfo_sub_self( thread )
+		);
+
+		bli_thread_ibarrier( thread );
+
+		// This variant executes multiple rank-k updates. Therefore, if the
+		// internal beta scalar on matrix C is non-zero, we must use it
+		// only for the first iteration (and then BLIS_ONE for all others).
+		// And since c is an aliased obj_t (see _int() function), we can
+		// simply overwrite the internal beta scalar with BLIS_ONE once it
+		// has been used in the first iteration.
+		if ( i == 0 && bli_thread_am_ichief( thread ) )
+			bli_obj_scalar_reset( c );
+	}
+}
+
diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/old/bli_herk_int.c
similarity index 67%
rename from frame/3/herk/bli_herk_int.c
rename to frame/3/herk/old/bli_herk_int.c
index 643a46ba4..b7d58940b 100644
--- a/frame/3/herk/bli_herk_int.c
+++ b/frame/3/herk/old/bli_herk_int.c
@@ -34,51 +34,38 @@ #include "blis.h" -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[2][4][3] = +#if 0 +static gemm_voft vars[4][3] = { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_herk_blk_var1f }, - { NULL, bli_herk_l_ker_var2, bli_herk_blk_var2f }, - { NULL, NULL, bli_herk_blk_var3f }, - { NULL, NULL, NULL }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_herk_blk_var1f }, - { NULL, bli_herk_u_ker_var2, bli_herk_blk_var2f }, - { NULL, NULL, bli_herk_blk_var3f }, - { NULL, NULL, NULL }, - } + // unblocked optimized unblocked blocked + { NULL, NULL, bli_herk_blk_var1 }, + { NULL, bli_herk_x_ker_var2, bli_herk_blk_var2 }, + { NULL, NULL, bli_herk_blk_var3 }, + { NULL, NULL, NULL }, }; +#endif -void bli_herk_int( obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_int + ( + obj_t* alpha, + obj_t* a, + obj_t* ah, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t ah_local; obj_t c_local; +#if 0 + bool_t uplo; +#endif varnum_t n; impl_t i; - bool_t
uplo; - FUNCPTR_T f; + gemm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -91,9 +78,9 @@ void bli_herk_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *ah ) ) { - if( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_obarrier( thread ); + if ( bli_thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + bli_thread_obarrier( thread ); return; } @@ -109,43 +96,55 @@ void bli_herk_int( obj_t* alpha, // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. +#if 0 if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } +#endif // If alpha is non-unit, typecast and apply it to the scalar // attached to A'. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &ah_local ); + bli_obj_scalar_apply_scalar( alpha, &ah_local ); } // If beta is non-unit, typecast and apply it to the scalar // attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } +#if 0 // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c_local ) ) uplo = 0; else uplo = 1; +#endif +#if 0 // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. - f = vars[uplo][n][i]; + f = vars[n][i]; +#endif + + // Extract the function pointer from the current control tree node. + f = bli_cntl_var_func( cntl ); // Invoke the variant. 
- f( &a_local, - &ah_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &ah_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/herk/bli_herk_int.h b/frame/3/herk/old/bli_herk_int.h similarity index 86% rename from frame/3/herk/bli_herk_int.h rename to frame/3/herk/old/bli_herk_int.h index 80442d228..1e649b968 100644 --- a/frame/3/herk/bli_herk_int.h +++ b/frame/3/herk/old/bli_herk_int.h @@ -32,12 +32,14 @@ */ -void bli_herk_int( obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - +void bli_herk_int + ( + obj_t* alpha, + obj_t* a, + obj_t* ah, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/herk/old/bli_herk_l_ker_var2.h b/frame/3/herk/old/bli_herk_l_ker_var2.h deleted file mode 100644 index 09656596d..000000000 --- a/frame/3/herk/old/bli_herk_l_ker_var2.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_herk_l_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - herk_thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( herk_l_ker_var2 ) - diff --git a/frame/3/herk/old/bli_herk_thread.c b/frame/3/herk/old/bli_herk_thread.c deleted file mode 100644 index 6bb9d6e98..000000000 --- a/frame/3/herk/old/bli_herk_thread.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include "assert.h" - -#if 0 -thrinfo_t** bli_herk_thrinfo_create_paths( void ) -{ - -#ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" ); -// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - dim_t kc_way = 1; - dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" ); - dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" ); - dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" ); -#else - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t jr_way = 1; - dim_t ir_way = 1; -#endif - - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; - dim_t ic_nt = jr_way * ir_way; - dim_t jr_nt = ir_way; - dim_t ir_nt = 1; - - - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); - - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); - for( int a = 0; a < jc_way; a++ ) - { - thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) - { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); - for( int c = 0; c < ic_way; c++ ) - { - thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt ); - for( int d = 0; d < jr_way; d++ ) - { - thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt ); - for( int e = 0; e < ir_way; e++ ) - { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - - // Macrokernel loops - thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL); - - thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info); - //blk_var_1 - 
packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info); - //blk_var_3 - packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info); - //blk_var_2 - packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info); - - paths[global_comm_id] = jc_info; - } - } - } - } - } - return paths; -} -#endif diff --git a/frame/3/herk/old/bli_herk_u_ker_var2.h b/frame/3/herk/old/bli_herk_u_ker_var2.h deleted file mode 100644 index 0701db148..000000000 --- a/frame/3/herk/old/bli_herk_u_ker_var2.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_herk_u_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - herk_thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( herk_u_ker_var2 ) - diff --git a/frame/3/old/bli_herk_direct.c b/frame/3/old/bli_herk_direct.c new file mode 100644 index 000000000..729812e84 --- /dev/null +++ b/frame/3/old/bli_herk_direct.c @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+// Partitioning direction for herk: always BLIS_FWD; a, ah and c are not inspected.
+dir_t bli_herk_direct
+     (
+       obj_t*  a,
+       obj_t*  ah,
+       obj_t*  c
+     )
+{
+	return BLIS_FWD;
+}
+
diff --git a/frame/cntl/bli_cntl_init.h b/frame/3/old/bli_herk_direct.h
similarity index 94%
rename from frame/cntl/bli_cntl_init.h
rename to frame/3/old/bli_herk_direct.h
index a3fdf6279..1f027561c 100644
--- a/frame/cntl/bli_cntl_init.h
+++ b/frame/3/old/bli_herk_direct.h
@@ -32,6 +32,9 @@ */
-void bli_cntl_init( void );
-void bli_cntl_finalize( void );
-bool_t bli_cntl_is_initialized( void );
+dir_t bli_herk_direct
+     (
+       obj_t*  a,
+       obj_t*  ah,
+       obj_t*  c
+     );
diff --git a/frame/3/herk/old/bli_herk_blk_var3f.h b/frame/3/old/bli_trmm_direct.c
similarity index 78%
rename from frame/3/herk/old/bli_herk_blk_var3f.h
rename to frame/3/old/bli_trmm_direct.c
index 800a44b8d..43be1b16a 100644
--- a/frame/3/herk/old/bli_herk_blk_var3f.h
+++ b/frame/3/old/bli_trmm_direct.c
@@ -32,10 +32,28 @@ */
-void bli_herk_blk_var3f( obj_t* a,
-                         obj_t* ah,
-                         obj_t* c,
-                         cntx_t* cntx,
-                         gemm_t* cntl,
-                         herk_thrinfo_t* thread );
+#include "blis.h"
+// Partitioning direction for trmm. Triangular A: lower -> BWD, else FWD. Otherwise B decides: lower -> FWD, else BWD.
+dir_t bli_trmm_direct
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c
+     )
+{
+	dir_t direct;
+
+	if ( bli_obj_root_is_triangular( *a ) )
+	{
+		if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_BWD;
+		else                               direct = BLIS_FWD;
+	}
+	else // if ( bli_obj_root_is_triangular( *b ) )
+	{
+		if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_FWD;
+		else                               direct = BLIS_BWD;
+	}
+
+	return direct;
+}
diff --git
a/frame/cntl/bli_cntl.c b/frame/3/old/bli_trmm_direct.h similarity index 95% rename from frame/cntl/bli_cntl.c rename to frame/3/old/bli_trmm_direct.h index ffd6120c8..905ba8fc9 100644 --- a/frame/cntl/bli_cntl.c +++ b/frame/3/old/bli_trmm_direct.h @@ -32,9 +32,10 @@ */ -#include "blis.h" +dir_t bli_trmm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ); -void bli_cntl_obj_free( void* cntl ) -{ - bli_free_intl( cntl ); -} diff --git a/frame/3/old/bli_trsm_direct.c b/frame/3/old/bli_trsm_direct.c new file mode 100644 index 000000000..c640705c8 --- /dev/null +++ b/frame/3/old/bli_trsm_direct.c @@ -0,0 +1,59 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+// Partitioning direction for trsm. Triangular A: lower -> FWD, else BWD. Otherwise B decides: lower -> BWD, else FWD.
+dir_t bli_trsm_direct
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c
+     )
+{
+	dir_t direct;
+
+	if ( bli_obj_root_is_triangular( *a ) )
+	{
+		if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_FWD;
+		else                               direct = BLIS_BWD;
+	}
+	else // if ( bli_obj_root_is_triangular( *b ) )
+	{
+		if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_BWD;
+		else                               direct = BLIS_FWD;
+	}
+
+	return direct;
+}
+
diff --git a/frame/3/old/bli_trsm_direct.h b/frame/3/old/bli_trsm_direct.h
new file mode 100644
index 000000000..d7e7c206b
--- /dev/null
+++ b/frame/3/old/bli_trsm_direct.h
@@ -0,0 +1,41 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +dir_t bli_trsm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ); + diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 79208b699..b864ce06a 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -34,14 +34,17 @@ #include "blis.h" -void bli_symm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_symm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t b_local; @@ -60,7 +63,7 @@ void bli_symm_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -71,7 +74,7 @@ void bli_symm_front( side_t side, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( b_local ); @@ -85,22 +88,27 @@ void bli_symm_front( side_t side, bli_obj_swap( a_local, b_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_GEMM, cntx ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h index 1fb9ec019..6ba9a5aeb 100644 --- a/frame/3/symm/bli_symm_front.h +++ b/frame/3/symm/bli_symm_front.h @@ -32,12 +32,14 @@ */ -void bli_symm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_symm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 2fa47d27a..936c43635 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_syr2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void 
bli_syr2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t c_local; obj_t a_local; @@ -61,7 +64,7 @@ void bli_syr2k_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -80,52 +83,47 @@ void bli_syr2k_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( c_local ); } -#if 0 - // Invoke the internal back-end. - bli_her2k_int( alpha, - &a_local, - &bt_local, - alpha, - &b_local, - &at_local, - beta, - &c_local, - cntl ); -#else + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); + // Invoke herk twice, using beta only the first time. - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &bt_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &bt_local, + beta, + &c_local, + cntx, + cntl, + infos + ); - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &b_local, - &at_local, - &BLIS_ONE, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); -#endif + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &b_local, + &at_local, + &BLIS_ONE, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syr2k/bli_syr2k_front.h b/frame/3/syr2k/bli_syr2k_front.h index 674dfe5ce..8d227c125 100644 --- a/frame/3/syr2k/bli_syr2k_front.h +++ b/frame/3/syr2k/bli_syr2k_front.h @@ -32,11 +32,13 @@ */ -void bli_syr2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_syr2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 54ca2bf8a..8b379ab0e 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_syrk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_syrk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t at_local; @@ -58,7 +61,7 @@ void bli_syrk_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and C in case we need to apply transformations. 
bli_obj_alias_to( *a, a_local ); @@ -73,27 +76,32 @@ void bli_syrk_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( c_local ); } - - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &at_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &at_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h index c7ab2a7b7..73f58baef 100644 --- a/frame/3/syrk/bli_syrk_front.h +++ b/frame/3/syrk/bli_syrk_front.h @@ -32,10 +32,12 @@ */ -void bli_syrk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_syrk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/trmm/bli_trmm.h b/frame/3/trmm/bli_trmm.h index 056fedb50..4eeec84e0 100644 --- a/frame/3/trmm/bli_trmm.h +++ b/frame/3/trmm/bli_trmm.h @@ -33,7 +33,6 @@ */ #include "bli_trmm_front.h" -#include "bli_trmm_int.h" #include "bli_trmm_var.h" diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 9b860405c..689acbb72 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_trmm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - gemm_t* cntl ) +void bli_trmm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t b_local; @@ -58,7 +61,7 @@ void bli_trmm_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); @@ -104,7 +107,7 @@ void bli_trmm_front( side_t side, // NOTE: We disable the optimization for 1x1 matrices since the concept // of row- vs. column storage breaks down. 
if ( !bli_obj_is_1x1( c_local ) ) - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); @@ -127,24 +130,28 @@ void bli_trmm_front( side_t side, bli_obj_set_as_root( a_local ); bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_TRMM, cntx ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_trmm_int, - alpha, - &a_local, - &b_local, - &BLIS_ZERO, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM, side ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + &BLIS_ZERO, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h index a05284336..7a263fdb1 100644 --- a/frame/3/trmm/bli_trmm_front.h +++ b/frame/3/trmm/bli_trmm_front.h @@ -32,10 +32,12 @@ */ -void bli_trmm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_trmm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 34928b04d..cc729834b 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); -void bli_trmm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -308,7 +311,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 382d54952..eacf91795 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); -void bli_trmm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_lu_ker_var2 + ( + 
obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -315,7 +318,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 72ac03a14..f8b09a3f5 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); -void bli_trmm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -315,7 +318,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 0bae832d3..3fb94c9d6 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); -void bli_trmm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -316,7 +319,7 @@ void 
PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index e10166401..d3ac2fa34 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -46,17 +46,15 @@ void PASTEMAC0(opname) \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ - gemm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( trmm_blk_var1f ) -//GENPROT( trmm_blk_var1b ) // variant doesn't exist b/c it's not needed -GENPROT( trmm_blk_var2f ) -GENPROT( trmm_blk_var2b ) -GENPROT( trmm_blk_var3f ) -GENPROT( trmm_blk_var3b ) +//GENPROT( trmm_blk_var1 ) +//GENPROT( trmm_blk_var2 ) +//GENPROT( trmm_blk_var3 ) +GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) diff --git a/frame/cntl/bli_cntl.h b/frame/3/trmm/bli_trmm_xx_ker_var2.c similarity index 61% rename from frame/cntl/bli_cntl.h rename to frame/3/trmm/bli_trmm_xx_ker_var2.c index c53270f9b..cbec35678 100644 --- a/frame/cntl/bli_cntl.h +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -32,53 +32,56 @@ */ -#include "bli_cntl_init.h" +#include "blis.h" -typedef enum +static gemm_voft vars[2][2] = { - BLIS_UNBLOCKED = 0, - BLIS_UNB_FUSED = 1, - BLIS_UNB_OPT = 1, - BLIS_BLOCKED = 2 -} impl_t; + { bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 }, + { bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 } +}; -typedef enum +void bli_trmm_xx_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { - BLIS_VARIANT1 = 0, - BLIS_VARIANT2, - BLIS_VARIANT3, - BLIS_VARIANT4, - BLIS_VARIANT5, - BLIS_VARIANT6, - BLIS_VARIANT7, - BLIS_VARIANT8, - BLIS_VARIANT9, -} varnum_t; + bool_t side; + bool_t uplo; + gemm_voft f; + // Set two bools: one based on the 
implied side parameter (the structure + // of the root object) and one based on the uplo field of the triangular + // matrix's root object (whether that is matrix A or matrix B). + if ( bli_obj_root_is_triangular( *a ) ) + { + side = 0; + if ( bli_obj_root_is_lower( *a ) ) uplo = 0; + else uplo = 1; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + side = 1; + if ( bli_obj_root_is_lower( *b ) ) uplo = 0; + else uplo = 1; + } -void bli_cntl_obj_free( void* cntl ); + // Index into the variant array to extract the correct function pointer. + f = vars[side][uplo]; - - -// -- Control tree accessor macros (common to many node types) -- - -#define bli_cntl_impl_type( cntl ) cntl->impl_type -#define bli_cntl_var_num( cntl ) cntl->var_num -#define bli_cntl_bszid( cntl ) cntl->bszid - - - -// -- Control tree query macros -- - -#define bli_cntl_is_noop( cntl ) \ -\ - ( cntl == NULL ) - -#define bli_cntl_is_leaf( cntl ) \ -\ - ( bli_cntl_impl_type( cntl ) != BLIS_BLOCKED ) - -#define bli_cntl_is_blocked( cntl ) \ -\ - ( bli_cntl_impl_type( cntl ) == BLIS_BLOCKED ) + // Call the macrokernel. + f + ( + a, + b, + c, + cntx, + cntl, + thread + ); +} diff --git a/frame/3/trmm/old/bli_trmm_blk_var1.c b/frame/3/trmm/old/bli_trmm_blk_var1.c new file mode 100644 index 000000000..9f2e91d07 --- /dev/null +++ b/frame/3/trmm/old/bli_trmm_blk_var1.c @@ -0,0 +1,98 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_m( a, b, c ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_mdim + ( + direct, thread, a, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); + + // Partition along the m dimension.
+ for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and C1. + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trmm subproblem. + bli_trmm_int + ( + &BLIS_ONE, + &a1, + b, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + } +} + diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/old/bli_trmm_blk_var1f.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var1f.c rename to frame/3/trmm/old/bli_trmm_blk_var1f.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var1f.h b/frame/3/trmm/old/bli_trmm_blk_var1f.h deleted file mode 100644 index e0876af88..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var1f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission.
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/old/bli_trmm_blk_var2.c b/frame/3/trmm/old/bli_trmm_blk_var2.c new file mode 100644 index 000000000..df5f58614 --- /dev/null +++ b/frame/3/trmm/old/bli_trmm_blk_var2.c @@ -0,0 +1,98 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_n( a, b, c ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_ndim + ( + direct, thread, b, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); + + // Partition along the n dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trmm subproblem.
+ bli_trmm_int + ( + &BLIS_ONE, + a, + &b1, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + } +} + diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/old/bli_trmm_blk_var2b.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var2b.c rename to frame/3/trmm/old/bli_trmm_blk_var2b.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var2b.h b/frame/3/trmm/old/bli_trmm_blk_var2b.h deleted file mode 100644 index 35f41a9af..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var2b.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var2b( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/old/bli_trmm_blk_var2f.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var2f.c rename to frame/3/trmm/old/bli_trmm_blk_var2f.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var2f.h b/frame/3/trmm/old/bli_trmm_blk_var2f.h deleted file mode 100644 index 7ed265e42..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var2f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/old/bli_trmm_blk_var3.c b/frame/3/trmm/old/bli_trmm_blk_var3.c new file mode 100644 index 000000000..2957cf153 --- /dev/null +++ b/frame/3/trmm/old/bli_trmm_blk_var3.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_k( a, b, c ); + + // Query dimension in partitioning direction. + k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_trmm_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Perform trmm subproblem.
+ bli_trmm_int + ( + &BLIS_ONE, + &a1, + &b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + + // Unlike variant 3 of gemm and herk, which reset the internal scalar + // on C at the end of the first iteration so that subsequent iterations + // do not erroneously apply beta more than once, it is important that + // this behavior not be applied to trmm. That is because the order of + // computation is always such that beta must be zero, since the macro- + // kernel only applies beta to the row-panel of C that corresponds to + // the current block intersecting the diagonal. It turns out that this + // same pattern works for trmm3 as well--by only applying beta to + // the current row-panel of C, beta is applied to all of C exactly + // once. Thus, for neither trmm nor trmm3 should we reset the scalar + // on C after the first iteration. + } +} + diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/old/bli_trmm_blk_var3b.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var3b.c rename to frame/3/trmm/old/bli_trmm_blk_var3b.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var3b.h b/frame/3/trmm/old/bli_trmm_blk_var3b.h deleted file mode 100644 index 4e9113c6a..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var3b.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer.
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var3b( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/old/bli_trmm_blk_var3f.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var3f.c rename to frame/3/trmm/old/bli_trmm_blk_var3f.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var3f.h b/frame/3/trmm/old/bli_trmm_blk_var3f.h deleted file mode 100644 index 50d8c6bbb..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var3f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_trmm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/old/bli_trmm_int.c similarity index 64% rename from frame/3/trmm/bli_trmm_int.c rename to frame/3/trmm/old/bli_trmm_int.c index d39722e95..830a22d1f 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/old/bli_trmm_int.c @@ -34,73 +34,38 @@ #include "blis.h" -#define FUNCPTR_T trmm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[2][2][4][3] = +#if 0 +static gemm_voft vars[4][3] = { - // left - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_ll_ker_var2, bli_trmm_blk_var2b }, - { NULL, NULL, bli_trmm_blk_var3b }, - { NULL, NULL, NULL }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_lu_ker_var2, bli_trmm_blk_var2f }, - { NULL, NULL, bli_trmm_blk_var3f }, - { NULL, NULL, NULL }, - } - }, - // right - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_rl_ker_var2, bli_trmm_blk_var2f }, - { NULL, NULL, bli_trmm_blk_var3f }, - { NULL, NULL, NULL }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_ru_ker_var2, bli_trmm_blk_var2b }, - { NULL, NULL, bli_trmm_blk_var3b }, - { NULL, NULL, NULL }, - } - } + // unblocked optimized unblocked blocked + { NULL, NULL, bli_trmm_blk_var1 }, + { NULL, bli_trmm_xx_ker_var2, bli_trmm_blk_var2 }, + { NULL, NULL, bli_trmm_blk_var3 }, + { NULL, NULL, NULL }, }; +#endif -void bli_trmm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + 
obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t b_local; obj_t c_local; +#if 0 bool_t side, uplo; +#endif varnum_t n; impl_t i; - FUNCPTR_T f; + gemm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -131,11 +96,13 @@ void bli_trmm_int( obj_t* alpha, // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. +#if 0 if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { bli_obj_induce_trans( c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } +#endif // If alpha is non-unit, typecast and apply it to the scalar attached // to B. @@ -151,6 +118,7 @@ void bli_trmm_int( obj_t* alpha, bli_obj_scalar_apply_scalar( beta, &c_local ); } +#if 0 // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular // matrix's root object (whether that is matrix A or matrix B). @@ -163,24 +131,32 @@ void bli_trmm_int( obj_t* alpha, else // if ( bli_obj_root_is_triangular( *b ) ) { side = 1; - // Set a bool based on the uplo field of A's root object. if ( bli_obj_root_is_lower( *b ) ) uplo = 0; else uplo = 1; } +#endif +#if 0 // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. - f = vars[side][uplo][n][i]; + f = vars[n][i]; +#endif + + // Extract the function pointer from the current control tree node. + f = bli_cntl_var_func( cntl ); // Invoke the variant. 
- f( &a_local, - &b_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &b_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_int.h b/frame/3/trmm/old/bli_trmm_int.h similarity index 86% rename from frame/3/trmm/bli_trmm_int.h rename to frame/3/trmm/old/bli_trmm_int.h index e529d02f6..697fc06b5 100644 --- a/frame/3/trmm/bli_trmm_int.h +++ b/frame/3/trmm/old/bli_trmm_int.h @@ -32,11 +32,15 @@ */ -void bli_trmm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); +void bli_trmm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); + diff --git a/frame/3/trmm/old/bli_trmm_ll_ker_var2.h b/frame/3/trmm/old/bli_trmm_ll_ker_var2.h deleted file mode 100644 index 384defe09..000000000 --- a/frame/3/trmm/old/bli_trmm_ll_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_lu_ker_var2.h b/frame/3/trmm/old/bli_trmm_lu_ker_var2.h deleted file mode 100644 index 74a17e6b4..000000000 --- a/frame/3/trmm/old/bli_trmm_lu_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_rl_ker_var2.h b/frame/3/trmm/old/bli_trmm_rl_ker_var2.h deleted file mode 100644 index 64d1128fb..000000000 --- a/frame/3/trmm/old/bli_trmm_rl_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_ru_ker_var2.h b/frame/3/trmm/old/bli_trmm_ru_ker_var2.h deleted file mode 100644 index 3df303b60..000000000 --- a/frame/3/trmm/old/bli_trmm_ru_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_thread.c b/frame/3/trmm/old/bli_trmm_thread.c deleted file mode 100644 index b17c30dd6..000000000 --- a/frame/3/trmm/old/bli_trmm_thread.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include "assert.h" - -#if 0 -thrinfo_t** bli_trmm_thrinfo_create_paths( bool_t jc_dependency ) -{ - -#ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" ); -// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - dim_t kc_way = 1; - dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" ); - dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" ); - dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" ); - - if ( jc_dependency ) - { - jr_way *= jc_way; - jc_way = 1; - } -#else - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t jr_way = 1; - dim_t ir_way = 1; -#endif - - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; - dim_t ic_nt = jr_way * ir_way; - dim_t jr_nt = ir_way; - dim_t ir_nt = 1; - - - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); - - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); - for( int a = 0; a < jc_way; a++ ) - { - thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) - { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); - for( int c = 0; c < ic_way; c++ ) - { - thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt ); - for( int d = 0; d < jr_way; d++ ) - { - thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt ); - for( int e = 0; e < ir_way; 
e++ ) - { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - - // Macrokernel loops - thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL); - - thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info); - //blk_var_1 - packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info); - //blk_var_3 - packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info); - //blk_var_2 - packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info); - - paths[global_comm_id] = jc_info; - } - } - } - } - } - return paths; -} -#endif diff --git a/frame/3/trmm3/bli_trmm3_front.c 
b/frame/3/trmm3/bli_trmm3_front.c index 19090bee5..e9e9261f0 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -34,14 +34,17 @@ #include "blis.h" -void bli_trmm3_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_trmm3_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t b_local; @@ -60,7 +63,7 @@ void bli_trmm3_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); @@ -103,7 +106,7 @@ void bli_trmm3_front( side_t side, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); @@ -127,22 +130,27 @@ void bli_trmm3_front( side_t side, bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM3, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_TRMM, cntx ); - // Invoke the internal back-end. 
- bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_trmm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM3, side ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + n_threads, + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h index 052d83249..ed158c0b8 100644 --- a/frame/3/trmm3/bli_trmm3_front.h +++ b/frame/3/trmm3/bli_trmm3_front.h @@ -32,11 +32,14 @@ */ -void bli_trmm3_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); +void bli_trmm3_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c new file mode 100644 index 000000000..a731d8265 --- /dev/null +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trsm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_m( a, b, c, cntx ); + + // Determine the current thread's subpartition range [my_start, my_end) along the m dimension. + bli_thread_get_range_mdim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); + + // Partition this thread's range along the m dimension, advancing by the blocksize chosen in each iteration. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire the matching m-dimension partitions A1 (of a) and C1 (of c).
+ bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trsm subproblem on A1 and C1 (b is passed through unpartitioned), using the sub-nodes of the control tree and thread info. + bli_trsm_int + ( + &BLIS_ONE, + &a1, + b, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + } +} + diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c deleted file mode 100644 index b3a73da6e..000000000 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_trsm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) -{ - obj_t b_pack_s; - obj_t a1_pack_s; - - obj_t a1, c1; - obj_t* b_pack = NULL; - obj_t* a1_pack = NULL; - - dim_t i; - dim_t b_alg; - - // Prune any zero region that exists along the partitioning dimension. - bli_trsm_prune_unref_mparts_m( a, b, c ); - - // Initialize object for packing B. - if( bli_thread_am_ochief( thread ) ) { - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); - - // Initialize object for packing B. - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - - // Pack B1 (if instructed). - bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_t2b( thread, a, - ( bli_obj_root_is_triangular( *a ) ? - bli_cntx_get_bmult( BLIS_MR, cntx ) : - bli_cntx_get_bmult( BLIS_NR, cntx ) ), - &my_start, &my_end ); - - // Partition along the remaining portion of the m dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. 
- bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize object for packing A1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform trsm subproblem. - bli_trsm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - &c1, - cntx, - bli_cntl_sub_trsm( cntl ), - bli_thrinfo_sub_self( thread ) ); - bli_thread_ibarrier( thread ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ) - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); -} - diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c new file mode 100644 index 000000000..a133f0bb0 --- /dev/null +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trsm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_ndim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); + + // Partition along the n dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. 
+ bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trsm subproblem. + bli_trsm_int + ( + &BLIS_ONE, + a, + &b1, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + } +} + diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c deleted file mode 100644 index 42d65100e..000000000 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_trsm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) -{ - obj_t a_pack_s; - obj_t b1_pack_s, c1_pack_s; - - obj_t b1, c1; - obj_t* a_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c1_pack = NULL; - - dim_t i; - dim_t b_alg; - - // Prune any zero region that exists along the partitioning dimension. - bli_trsm_prune_unref_mparts_n( a, b, c ); - - // Initialize pack objects for A that are passed into packm_init(). - if( bli_thread_am_ochief( thread ) ) { - bli_obj_init_pack( &a_pack_s ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for B and C that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &b1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). - bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_l2r( thread, b, - ( bli_obj_root_is_triangular( *b ) ? 
- bli_cntx_get_bmult( BLIS_MR, cntx ) : - bli_cntx_get_bmult( BLIS_NR, cntx ) ), - &my_start, &my_end ); - - // Partition along the n dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for B1 and C1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and B1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform trsm subproblem. - bli_trsm_int( &BLIS_ONE, - a_pack, - b1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_trsm( cntl ), - bli_thrinfo_sub_self( thread ) ); - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c new file mode 100644 index 000000000..9d726389f --- /dev/null +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -0,0 +1,102 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trsm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + + // Query dimension in partitioning direction. + k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Perform trsm subproblem. + bli_trsm_int + ( + &BLIS_ONE, + &a1, + &b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + bli_thread_ibarrier( thread ); + + // This variant executes multiple rank-k updates. Therefore, if the + // internal alpha scalars on A/B and C are non-zero, we must ensure + // that they are only used in the first iteration. 
+ if ( i == 0 ) + { + bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( c ); + } + } +} + diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c deleted file mode 100644 index 52cfb1fc5..000000000 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_trsm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) -{ - obj_t c_pack_s; - obj_t a1_pack_s, b1_pack_s; - - obj_t a1, b1; - obj_t* a1_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c_pack = NULL; - - dim_t i; - dim_t b_alg; - dim_t k_trans; - - // Prune any zero region that exists along the partitioning dimension. - bli_trsm_prune_unref_mparts_k( a, b, c ); - - // Initialize pack objects for C that are passed into packm_init(). - if( bli_thread_am_ochief( thread ) ) { - bli_obj_init_pack( &c_pack_s ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &b1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - - // Pack C (if instructed). - bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // Query dimension in partitioning direction. - k_trans = bli_obj_width_after_trans( *a ); - - // Partition along the k dimension. - for ( i = 0; i < k_trans; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: We call a trsm-specific function to determine the kc - // blocksize so that we can implement the "nudging" of kc to be - // a multiple of mr, as needed. - b_alg = bli_trsm_determine_kc_f( i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and B1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - - // Initialize objects for packing A1 and B1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform trsm subproblem. - bli_trsm_int( &BLIS_ONE, - a1_pack, - b1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_trsm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal alpha scalars on A/B and C are non-zero, we must ensure - // that they are only used in the first iteration. - bli_thread_ibarrier( thread ); - if ( i == 0 && bli_thread_am_ichief( thread ) ) { - bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( c_pack ); - } - } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- if( bli_thread_am_ochief( thread ) ) { - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - } - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - } -} - diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 3a83faafd..b4f7422ba 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -34,235 +34,177 @@ #include "blis.h" -extern scalm_t* scalm_cntl; - -extern gemm_t* gemm_cntl_bp_ke; - -packm_t* trsm_l_packa_cntl = NULL; -packm_t* trsm_l_packb_cntl = NULL; - -packm_t* trsm_r_packa_cntl = NULL; -packm_t* trsm_r_packb_cntl = NULL; - -trsm_t* trsm_cntl_bp_ke = NULL; - -trsm_t* trsm_l_cntl_op_bp = NULL; -trsm_t* trsm_l_cntl_mm_op = NULL; -trsm_t* trsm_l_cntl_vl_mm = NULL; - -trsm_t* trsm_r_cntl_op_bp = NULL; -trsm_t* trsm_r_cntl_mm_op = NULL; -trsm_t* trsm_r_cntl_vl_mm = NULL; - -trsm_t* trsm_l_cntl = NULL; -trsm_t* trsm_r_cntl = NULL; - - -void bli_trsm_cntl_init() +cntl_t* bli_trsm_cntl_create + ( + side_t side + ) { - - // Create control tree objects for packm operations (left side). - trsm_l_packa_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - // IMPORTANT: n dim multiple must be mr to - // support right and bottom-right edge cases - BLIS_MR, - BLIS_MR, - TRUE, // invert diagonal - TRUE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK ); - - trsm_l_packb_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - // IMPORTANT: m dim multiple must be mr since - // B_pack is updated (ie: serves as C) in trsm - BLIS_MR, - BLIS_NR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL ); - - // Create control tree objects for packm operations (right side). 
- trsm_r_packa_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_NR, - BLIS_MR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK ); - - trsm_r_packb_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, // pack panels of B compactly - BLIS_MR, - BLIS_MR, - TRUE, // invert diagonal - FALSE, // reverse iteration if upper? - TRUE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL ); - - - // Create control tree object for lowest-level block-panel kernel. - trsm_cntl_bp_ke - = - bli_trsm_cntl_obj_create( BLIS_UNB_OPT, - BLIS_VARIANT2, - 0, // bszid_t not used by macro-kernel - NULL, NULL, NULL, NULL, - NULL, NULL, NULL ); - - // Create control tree object for outer panel (to block-panel) - // problem (left side). - trsm_l_cntl_op_bp - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MC, - NULL, - trsm_l_packa_cntl, - trsm_l_packb_cntl, - NULL, - trsm_cntl_bp_ke, - gemm_cntl_bp_ke, - NULL ); - - // Create control tree object for general problem via multiple - // rank-k (outer panel) updates (left side). - trsm_l_cntl_mm_op - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, - BLIS_KC, - NULL, - NULL, - NULL, - NULL, - trsm_l_cntl_op_bp, - NULL, - NULL ); - - // Create control tree object for very large problem via multiple - // general problems (left side). - trsm_l_cntl_vl_mm - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, - BLIS_NC, - NULL, - NULL, - NULL, - NULL, - trsm_l_cntl_mm_op, - NULL, - NULL ); - - // Create control tree object for outer panel (to block-panel) - // problem (right side). 
- trsm_r_cntl_op_bp - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MC, - NULL, - trsm_r_packa_cntl, - trsm_r_packb_cntl, - NULL, - trsm_cntl_bp_ke, - gemm_cntl_bp_ke, - NULL ); - - // Create control tree object for general problem via multiple - // rank-k (outer panel) updates (right side). - trsm_r_cntl_mm_op - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, - BLIS_KC, - NULL, - NULL, - NULL, - NULL, - trsm_r_cntl_op_bp, - NULL, - NULL ); - - // Create control tree object for very large problem via multiple - // general problems (right side). - trsm_r_cntl_vl_mm - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, - BLIS_NC, - NULL, - NULL, - NULL, - NULL, - trsm_r_cntl_mm_op, - NULL, - NULL ); - - // Alias the "master" trsm control trees to shorter names. - trsm_l_cntl = trsm_l_cntl_vl_mm; - trsm_r_cntl = trsm_r_cntl_vl_mm; + if ( bli_is_left( side ) ) return bli_trsm_l_cntl_create(); + else return bli_trsm_r_cntl_create(); } -void bli_trsm_cntl_finalize() +cntl_t* bli_trsm_l_cntl_create + ( + void + ) { - bli_cntl_obj_free( trsm_l_packa_cntl ); - bli_cntl_obj_free( trsm_l_packb_cntl ); - bli_cntl_obj_free( trsm_r_packa_cntl ); - bli_cntl_obj_free( trsm_r_packb_cntl ); + void* macro_kernel_p = bli_trsm_xx_ker_var2; - bli_cntl_obj_free( trsm_cntl_bp_ke ); + // Create a node for the macro-kernel. + cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create + ( + BLIS_NR, // bszid not used by macro-kernel. + macro_kernel_p, + NULL // no sub-node; this is the leaf of the tree. + ); - bli_cntl_obj_free( trsm_l_cntl_op_bp ); - bli_cntl_obj_free( trsm_l_cntl_mm_op ); - bli_cntl_obj_free( trsm_l_cntl_vl_mm ); - bli_cntl_obj_free( trsm_r_cntl_op_bp ); - bli_cntl_obj_free( trsm_r_cntl_mm_op ); - bli_cntl_obj_free( trsm_r_cntl_vl_mm ); + // Create a node for packing matrix A. 
+ cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_trsm_packa, + bli_packm_blk_var1, + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + TRUE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + trsm_cntl_bp_ke + ); + + // Create a node for partitioning the m dimension by MC. + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + ( + BLIS_MC, + bli_trsm_blk_var1, + trsm_cntl_packa + ); + + // Create a node for packing matrix B. + cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_trsm_packb, + bli_packm_blk_var1, + BLIS_MR, + BLIS_NR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + trsm_cntl_op_bp + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + ( + BLIS_KC, + bli_trsm_blk_var3, + trsm_cntl_packb + ); + + // Create a node for partitioning the n dimension by NC. + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + ( + BLIS_NC, + bli_trsm_blk_var2, + trsm_cntl_mm_op + ); + + return trsm_cntl_vl_mm; } -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_packm_a, - packm_t* sub_packm_b, - packm_t* sub_packm_c, - trsm_t* sub_trsm, - gemm_t* sub_gemm, - unpackm_t* sub_unpackm_c ) +cntl_t* bli_trsm_r_cntl_create + ( + void + ) { - trsm_t* cntl; + void* macro_kernel_p = bli_trsm_xx_ker_var2; - cntl = ( trsm_t* ) bli_malloc_intl( sizeof(trsm_t) ); + // Create a node for the macro-kernel. + cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create + ( + BLIS_NR, // bszid not used by macro-kernel. + macro_kernel_p, + NULL // no sub-node; this is the leaf of the tree.
+ ); - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bszid = bszid; - cntl->sub_scalm = sub_scalm; - cntl->sub_packm_a = sub_packm_a; - cntl->sub_packm_b = sub_packm_b; - cntl->sub_packm_c = sub_packm_c; - cntl->sub_trsm = sub_trsm; - cntl->sub_gemm = sub_gemm; - cntl->sub_unpackm_c = sub_unpackm_c; + // Create a node for packing matrix A. + cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_trsm_packa, + bli_packm_blk_var1, + BLIS_NR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + trsm_cntl_bp_ke + ); - return cntl; + // Create a node for partitioning the m dimension by MC. + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + ( + BLIS_MC, + bli_trsm_blk_var1, + trsm_cntl_packa + ); + + // Create a node for packing matrix B. + cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_trsm_packb, + bli_packm_blk_var1, + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + FALSE, // reverse iteration if upper? + TRUE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + trsm_cntl_op_bp + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + ( + BLIS_KC, + bli_trsm_blk_var3, + trsm_cntl_packb + ); + + // Create a node for partitioning the n dimension by NC.
+ cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + ( + BLIS_NC, + bli_trsm_blk_var2, + trsm_cntl_mm_op + ); + + return trsm_cntl_vl_mm; +} + +void bli_trsm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ) +{ + bli_cntl_free( cntl, thread ); +} + +// ----------------------------------------------------------------------------- + +cntl_t* bli_trsm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ) +{ + return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 651cc8599..6dbe9adce 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -32,33 +32,33 @@ */ -struct trsm_s -{ - impl_t impl_type; - varnum_t var_num; - bszid_t bszid; - struct scalm_s* sub_scalm; - struct packm_s* sub_packm_a; - struct packm_s* sub_packm_b; - struct packm_s* sub_packm_c; - struct trsm_s* sub_trsm; - struct gemm_s* sub_gemm; - struct unpackm_s* sub_unpackm_c; -}; -typedef struct trsm_s trsm_t; +cntl_t* bli_trsm_cntl_create + ( + side_t side + ); -#define bli_cntl_sub_trsm( cntl ) cntl->sub_trsm +cntl_t* bli_trsm_l_cntl_create + ( + void + ); -void bli_trsm_cntl_init( void ); -void bli_trsm_cntl_finalize( void ); -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_pack_a, - packm_t* sub_pack_b, - packm_t* sub_pack_c, - trsm_t* sub_trsm, - gemm_t* sub_gemm, - unpackm_t* sub_unpack_c ); +cntl_t* bli_trsm_r_cntl_create + ( + void + ); + +void bli_trsm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +// ----------------------------------------------------------------------------- + +cntl_t* bli_trsm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ); diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 53cdbb1c8..3466d2d18 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -34,15 +34,16 
@@ #include "blis.h" -void bli_trsm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - trsm_t* l_cntl, - trsm_t* r_cntl ) +void bli_trsm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) { - trsm_t* cntl; obj_t a_local; obj_t b_local; obj_t c_local; @@ -60,7 +61,7 @@ void bli_trsm_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); @@ -115,26 +116,27 @@ void bli_trsm_front( side_t side, bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Choose the control tree. - if ( bli_is_left( side ) ) cntl = l_cntl; - else cntl = r_cntl; + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_TRSM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRSM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_trsm_int, - alpha, - &a_local, - &b_local, - alpha, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRSM, side ); + dim_t n_threads = bli_thread_num_threads( infos[0] ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + n_threads, + bli_trsm_int, + alpha, + &a_local, + &b_local, + alpha, + &c_local, + cntx, + cntl, + infos + ); + bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index 6ee063797..84feef22f 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -32,11 +32,12 @@ */ -void bli_trsm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - trsm_t* l_cntl, - trsm_t* r_cntl ); - +void bli_trsm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index 123ef6585..e6614cb3f 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -34,73 +34,22 @@ #include "blis.h" -#define FUNCPTR_T trsm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[2][2][4][3] = -{ - // left - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1f }, - { NULL, bli_trsm_ll_ker_var2, bli_trsm_blk_var2f }, - { NULL, NULL, bli_trsm_blk_var3f }, - { NULL, NULL, NULL, }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1b }, - { NULL, bli_trsm_lu_ker_var2, bli_trsm_blk_var2b }, - { NULL, NULL, bli_trsm_blk_var3b }, - { NULL, NULL, NULL, }, - } - }, - // right - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1b }, - { NULL, bli_trsm_rl_ker_var2, bli_trsm_blk_var2b }, - { NULL, NULL, bli_trsm_blk_var3b }, - { NULL, NULL, NULL, }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1f }, - { NULL, bli_trsm_ru_ker_var2, bli_trsm_blk_var2f }, - { NULL, NULL, bli_trsm_blk_var3f }, - { NULL, NULL, NULL, }, - } - } -}; - -void bli_trsm_int( obj_t* alpha, - obj_t* 
a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t b_local; obj_t c_local; - bool_t side, uplo; - varnum_t n; - impl_t i; - FUNCPTR_T f; + trsm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -113,9 +62,9 @@ void bli_trsm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - if( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_obarrier( thread ); + if ( bli_thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + bli_thread_obarrier( thread ); return; } @@ -133,14 +82,14 @@ void bli_trsm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } // If beta is non-unit, apply it to the scalar attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -148,24 +97,15 @@ void bli_trsm_int( obj_t* alpha, // matrix's root object (whether that is matrix A or matrix B). if ( bli_obj_root_is_triangular( *a ) ) { - side = 0; - if ( bli_obj_root_is_lower( *a ) ) uplo = 0; - else uplo = 1; - // If alpha is non-unit, typecast and apply it to the scalar // attached to B (the non-triangular matrix). 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + bli_obj_scalar_apply_scalar( alpha, &b_local ); } } else // if ( bli_obj_root_is_triangular( *b ) ) { - side = 1; - // Set a bool based on the uplo field of A's root object. - if ( bli_obj_root_is_lower( *b ) ) uplo = 0; - else uplo = 1; - // If alpha is non-unit, typecast and apply it to the scalar // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) @@ -174,21 +114,21 @@ void bli_trsm_int( obj_t* alpha, } } - bli_thread_obarrier( thread ); + // FGVZ->TMS: Is this barrier still needed? + bli_thread_obarrier( thread ); - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. - f = vars[side][uplo][n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_var_func( cntl ); // Invoke the variant. 
- f( &a_local, - &b_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &b_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/trsm/bli_trsm_int.h index deecc6565..a147a3298 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/trsm/bli_trsm_int.h @@ -32,11 +32,15 @@ */ -void bli_trsm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ); +void bli_trsm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); + diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index db4668d1f..b7d695318 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); -void bli_trsm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -93,10 +96,11 @@ void bli_trsm_ll_ker_var2( obj_t* a, FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to B. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of B prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
buf_alpha1 = bli_obj_internal_scalar_buffer( *b ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 17041d986..763592644 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); -void bli_trsm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -93,10 +96,11 @@ void bli_trsm_lu_ker_var2( obj_t* a, FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to B. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of B prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *b ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_packab.c b/frame/3/trsm/bli_trsm_packab.c new file mode 100644 index 000000000..3a32ce097 --- /dev/null +++ b/frame/3/trsm/bli_trsm_packab.c @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trsm_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a_pack; + + // Pack matrix A according to the control tree node. + bli_l3_packm + ( + a, + &a_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. 
+ bli_trsm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} + +// ----------------------------------------------------------------------------- + +void bli_trsm_packb + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b_pack; + + // Pack matrix B according to the control tree node. + bli_l3_packm + ( + b, + &b_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix B. + bli_trsm_int + ( + &BLIS_ONE, + a, + &b_pack, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} + diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 40f3d5511..a18e88939 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); -void bli_trsm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -93,10 +96,11 @@ void bli_trsm_rl_ker_var2( obj_t* a, FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to A. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of A prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). 
This scalar may be unit, if for example it + // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 6482fa777..f5dad161b 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); -void bli_trsm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -93,10 +96,11 @@ void bli_trsm_ru_ker_var2( obj_t* a, FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to A. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of A prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 2a2c0efc8..2ff45fa13 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -46,17 +46,17 @@ void PASTEMAC0(opname) \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ - trsm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( trsm_blk_var1f ) -GENPROT( trsm_blk_var1b ) -GENPROT( trsm_blk_var2f ) -GENPROT( trsm_blk_var2b ) -GENPROT( trsm_blk_var3f ) -GENPROT( trsm_blk_var3b ) +GENPROT( trsm_blk_var1 ) +GENPROT( trsm_blk_var2 ) +GENPROT( trsm_blk_var3 ) +GENPROT( trsm_packa ) +GENPROT( trsm_packb ) +GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) diff --git a/frame/1/packv/bli_packv_cntl.h b/frame/3/trsm/bli_trsm_xx_ker_var2.c similarity index 61% rename from frame/1/packv/bli_packv_cntl.h rename to frame/3/trsm/bli_trsm_xx_ker_var2.c index d4682f085..8409432ca 100644 --- a/frame/1/packv/bli_packv_cntl.h +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -32,32 +32,56 @@ */ -struct packv_s +#include "blis.h" + +static trsm_voft vars[2][2] = { - impl_t impl_type; - varnum_t var_num; - bszid_t bmid; - pack_t pack_schema; + { bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 }, + { bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 } }; -typedef struct packv_s packv_t; -#define cntl_bmid( cntl ) cntl->bmid +void bli_trsm_xx_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + bool_t side; + bool_t uplo; + trsm_voft f; -#define bli_cntl_sub_packv( cntl ) cntl->sub_packv -#define bli_cntl_sub_packv_x( cntl ) cntl->sub_packv_x -#define bli_cntl_sub_packv_x1( cntl ) cntl->sub_packv_x1 -#define bli_cntl_sub_packv_y( cntl ) cntl->sub_packv_y -#define bli_cntl_sub_packv_y1( cntl ) cntl->sub_packv_y1 + // Set two bools: one based on the implied side parameter (the 
structure + // of the root object) and one based on the uplo field of the triangular + // matrix's root object (whether that is matrix A or matrix B). + if ( bli_obj_root_is_triangular( *a ) ) + { + side = 0; + if ( bli_obj_root_is_lower( *a ) ) uplo = 0; + else uplo = 1; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + side = 1; + if ( bli_obj_root_is_lower( *b ) ) uplo = 0; + else uplo = 1; + } -void bli_packv_cntl_init( void ); -void bli_packv_cntl_finalize( void ); -packv_t* bli_packv_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bmid, - pack_t pack_schema ); -void bli_packv_cntl_obj_init( packv_t* cntl, - impl_t impl_type, - varnum_t var_num, - bszid_t bmid, - pack_t pack_schema ); + // Index into the variant array to extract the correct function pointer. + f = vars[side][uplo]; + + // Call the macrokernel. + f + ( + a, + b, + c, + cntx, + cntl, + thread + ); +} diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/old/bli_trsm_blk_var1b.c similarity index 100% rename from frame/3/trsm/bli_trsm_blk_var1b.c rename to frame/3/trsm/old/bli_trsm_blk_var1b.c diff --git a/frame/3/trsm/old/bli_trsm_blk_var1b.h b/frame/3/trsm/old/bli_trsm_blk_var1b.h deleted file mode 100644 index 77601bb76..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var1b.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var1b( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/old/bli_trsm_blk_var1f.h b/frame/3/trsm/old/bli_trsm_blk_var1f.h deleted file mode 100644 index df5a9d3fd..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var1f.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/old/bli_trsm_blk_var2b.c similarity index 100% rename from frame/3/trsm/bli_trsm_blk_var2b.c rename to frame/3/trsm/old/bli_trsm_blk_var2b.c diff --git a/frame/3/trsm/old/bli_trsm_blk_var2b.h b/frame/3/trsm/old/bli_trsm_blk_var2b.h deleted file mode 100644 index d890990e7..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var2b.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var2b( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/old/bli_trsm_blk_var3b.c similarity index 100% rename from frame/3/trsm/bli_trsm_blk_var3b.c rename to frame/3/trsm/old/bli_trsm_blk_var3b.c diff --git a/frame/3/trsm/old/bli_trsm_blk_var3b.h b/frame/3/trsm/old/bli_trsm_blk_var3b.h deleted file mode 100644 index 5cab7bdcf..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var3b.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var3b( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/old/bli_trsm_blk_var3f.h b/frame/3/trsm/old/bli_trsm_blk_var3f.h deleted file mode 100644 index 2c6fbb214..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var3f.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/old/bli_trsm_cntl.c b/frame/3/trsm/old/bli_trsm_cntl.c new file mode 100644 index 000000000..3a83faafd --- /dev/null +++ b/frame/3/trsm/old/bli_trsm_cntl.c @@ -0,0 +1,268 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +extern gemm_t* gemm_cntl_bp_ke; + +packm_t* trsm_l_packa_cntl = NULL; +packm_t* trsm_l_packb_cntl = NULL; + +packm_t* trsm_r_packa_cntl = NULL; +packm_t* trsm_r_packb_cntl = NULL; + +trsm_t* trsm_cntl_bp_ke = NULL; + +trsm_t* trsm_l_cntl_op_bp = NULL; +trsm_t* trsm_l_cntl_mm_op = NULL; +trsm_t* trsm_l_cntl_vl_mm = NULL; + +trsm_t* trsm_r_cntl_op_bp = NULL; +trsm_t* trsm_r_cntl_mm_op = NULL; +trsm_t* trsm_r_cntl_vl_mm = NULL; + +trsm_t* trsm_l_cntl = NULL; +trsm_t* trsm_r_cntl = NULL; + + +void bli_trsm_cntl_init( void ) +{ + + // Create control tree objects for packm operations (left side). + trsm_l_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + // IMPORTANT: n dim multiple must be mr to + // support right and bottom-right edge cases + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + TRUE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm_l_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + // IMPORTANT: m dim multiple must be mr since + // B_pack is updated (ie: serves as C) in trsm + BLIS_MR, + BLIS_NR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (right side). + trsm_r_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + BLIS_NR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm_r_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, // pack panels of B compactly + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + FALSE, // reverse iteration if upper? + TRUE, // reverse iteration if lower?
+ BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + trsm_cntl_bp_ke + = + bli_trsm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + 0, // bszid_t not used by macro-kernel + NULL, NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (left side). + trsm_l_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + BLIS_MC, + NULL, + trsm_l_packa_cntl, + trsm_l_packb_cntl, + NULL, + trsm_cntl_bp_ke, + gemm_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (left side). + trsm_l_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + BLIS_KC, + NULL, + NULL, + NULL, + NULL, + trsm_l_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (left side). + trsm_l_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + BLIS_NC, + NULL, + NULL, + NULL, + NULL, + trsm_l_cntl_mm_op, + NULL, + NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (right side). + trsm_r_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + BLIS_MC, + NULL, + trsm_r_packa_cntl, + trsm_r_packb_cntl, + NULL, + trsm_cntl_bp_ke, + gemm_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (right side). + trsm_r_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + BLIS_KC, + NULL, + NULL, + NULL, + NULL, + trsm_r_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (right side).
+ trsm_r_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + BLIS_NC, + NULL, + NULL, + NULL, + NULL, + trsm_r_cntl_mm_op, + NULL, + NULL ); + + // Alias the "master" trsm control trees to shorter names. + trsm_l_cntl = trsm_l_cntl_vl_mm; + trsm_r_cntl = trsm_r_cntl_vl_mm; +} + +void bli_trsm_cntl_finalize( void ) +{ + bli_cntl_obj_free( trsm_l_packa_cntl ); + bli_cntl_obj_free( trsm_l_packb_cntl ); + bli_cntl_obj_free( trsm_r_packa_cntl ); + bli_cntl_obj_free( trsm_r_packb_cntl ); + + bli_cntl_obj_free( trsm_cntl_bp_ke ); + + bli_cntl_obj_free( trsm_l_cntl_op_bp ); + bli_cntl_obj_free( trsm_l_cntl_mm_op ); + bli_cntl_obj_free( trsm_l_cntl_vl_mm ); + bli_cntl_obj_free( trsm_r_cntl_op_bp ); + bli_cntl_obj_free( trsm_r_cntl_mm_op ); + bli_cntl_obj_free( trsm_r_cntl_vl_mm ); +} + +trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, + varnum_t var_num, + bszid_t bszid, + scalm_t* sub_scalm, + packm_t* sub_packm_a, + packm_t* sub_packm_b, + packm_t* sub_packm_c, + trsm_t* sub_trsm, + gemm_t* sub_gemm, + unpackm_t* sub_unpackm_c ) +{ + trsm_t* cntl; + + cntl = bli_malloc_intl( sizeof( *cntl ) ); + + cntl->impl_type = impl_type; + cntl->var_num = var_num; + cntl->bszid = bszid; + cntl->sub_scalm = sub_scalm; + cntl->sub_packm_a = sub_packm_a; + cntl->sub_packm_b = sub_packm_b; + cntl->sub_packm_c = sub_packm_c; + cntl->sub_trsm = sub_trsm; + cntl->sub_gemm = sub_gemm; + cntl->sub_unpackm_c = sub_unpackm_c; + + return cntl; +} + diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.h b/frame/3/trsm/old/bli_trsm_cntl.h similarity index 61% rename from frame/1m/unpackm/bli_unpackm_blk_var2.h rename to frame/3/trsm/old/bli_trsm_cntl.h index 1f783260a..651cc8599 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var2.h +++ b/frame/3/trsm/old/bli_trsm_cntl.h @@ -32,30 +32,33 @@ */ -void bli_unpackm_blk_var2( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ); +struct trsm_s +{ + impl_t impl_type; + varnum_t var_num; + bszid_t bszid; + struct scalm_s*
sub_scalm; + struct packm_s* sub_packm_a; + struct packm_s* sub_packm_b; + struct packm_s* sub_packm_c; + struct trsm_s* sub_trsm; + struct gemm_s* sub_gemm; + struct unpackm_s* sub_unpackm_c; +}; +typedef struct trsm_s trsm_t; +#define bli_cntl_sub_trsm( cntl ) ( (cntl)->sub_trsm ) // sub_trsm field of the node cntl points to -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_panel, \ - dim_t n_panel, \ - void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC( unpackm_blk_var2 ) +void bli_trsm_cntl_init( void ); +void bli_trsm_cntl_finalize( void ); +trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, + varnum_t var_num, + bszid_t bszid, + scalm_t* sub_scalm, + packm_t* sub_packm_a, + packm_t* sub_packm_b, + packm_t* sub_packm_c, + trsm_t* sub_trsm, + gemm_t* sub_gemm, + unpackm_t* sub_unpackm_c ); diff --git a/frame/3/trsm/old/bli_trsm_ll_ker_var2.h b/frame/3/trsm/old/bli_trsm_ll_ker_var2.h deleted file mode 100644 index 09812df14..000000000 --- a/frame/3/trsm/old/bli_trsm_ll_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution.
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_ll_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_lu_ker_var2.h b/frame/3/trsm/old/bli_trsm_lu_ker_var2.h deleted file mode 100644 index aa7c8ed47..000000000 --- a/frame/3/trsm/old/bli_trsm_lu_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_lu_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_rl_ker_var2.h b/frame/3/trsm/old/bli_trsm_rl_ker_var2.h deleted file mode 100644 index 0fd7e6bbe..000000000 --- a/frame/3/trsm/old/bli_trsm_rl_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_rl_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_ru_ker_var2.h b/frame/3/trsm/old/bli_trsm_ru_ker_var2.h deleted file mode 100644 index a30e20070..000000000 --- a/frame/3/trsm/old/bli_trsm_ru_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_ru_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_thread.c b/frame/3/trsm/old/bli_trsm_thread.c deleted file mode 100644 index 1a9f4ec16..000000000 --- a/frame/3/trsm/old/bli_trsm_thread.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include "assert.h" - -#if 0 -thrinfo_t** bli_trsm_thrinfo_create_paths( bool_t right_sided ) -{ - -#ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" ); -// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - dim_t kc_way = 1; - dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" ); - dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" ); - dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" ); - - if ( right_sided ) - { - ic_way = jc_way * ic_way * jr_way; - - jc_way = 1; - kc_way = 1; - jr_way = 1; - ir_way = 1; - } - else - { - jr_way = ic_way * jr_way * ir_way; - - jc_way = 1; - kc_way = 1; - ic_way = 1; - ir_way = 1; - } -#else - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t jr_way = 1; - dim_t ir_way = 1; -#endif - - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; - dim_t ic_nt = jr_way * ir_way; - dim_t jr_nt = ir_way; - dim_t ir_nt = 1; - - - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); - - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); - for( int a = 0; a < jc_way; a++ ) - { - thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) - { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); - for( int c = 0; c < ic_way; c++ ) - { - thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt ); - for( int d = 0; d < jr_way; d++ ) - { - thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt ); - for( int e = 0; e < ir_way; e++ ) - { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - - // Macrokernel loops - thrinfo_t* ir_info = 
bli_l3_thrinfo_create_node( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL); - - thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info); - //blk_var_1 - packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info); - //blk_var_3 - packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info); - //blk_var_2 - packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info); - - paths[global_comm_id] = jc_info; - } - } - } - } - } - return paths; -} -#endif diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h new file mode 100644 index 000000000..aee1869a0 --- /dev/null +++ b/frame/base/bli_auxinfo.h @@ -0,0 +1,70 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_AUXINFO_MACRO_DEFS_H +#define BLIS_AUXINFO_MACRO_DEFS_H + + +// auxinfo_t field query + +#define bli_auxinfo_schema_a( auxinfo ) ( (auxinfo)->schema_a ) +#define bli_auxinfo_schema_b( auxinfo ) ( (auxinfo)->schema_b ) + +#define bli_auxinfo_next_a( auxinfo ) ( (auxinfo)->a_next ) +#define bli_auxinfo_next_b( auxinfo ) ( (auxinfo)->b_next ) + +#define bli_auxinfo_is_a( auxinfo ) ( (auxinfo)->is_a ) +#define bli_auxinfo_is_b( auxinfo ) ( (auxinfo)->is_b ) + + +// auxinfo_t field modification + +#define bli_auxinfo_set_schema_a( schema, auxinfo ) { (auxinfo).schema_a = schema; } +#define bli_auxinfo_set_schema_b( schema, auxinfo ) { (auxinfo).schema_b = schema; } + +#define bli_auxinfo_set_next_a( a_p, auxinfo ) { (auxinfo).a_next = a_p; } +#define bli_auxinfo_set_next_b( b_p, auxinfo ) { (auxinfo).b_next = b_p; } + +#define bli_auxinfo_set_next_ab( a_p, b_p, auxinfo ) \ +{ \ + bli_auxinfo_set_next_a( a_p, auxinfo ); \ + bli_auxinfo_set_next_b( b_p, auxinfo ); \ +} + +#define bli_auxinfo_set_is_a( is, auxinfo ) { (auxinfo).is_a = is; } +#define bli_auxinfo_set_is_b( is, auxinfo ) { (auxinfo).is_b = is; } + + +#endif + diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index e7bd0be2a..833dadb42 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -35,10 +35,13 @@ #include "blis.h" -blksz_t* bli_blksz_obj_create( dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ) +blksz_t* bli_blksz_obj_create + ( + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ) { blksz_t* b; @@ -53,11 +56,14 @@ blksz_t* bli_blksz_obj_create( dim_t b_s, dim_t be_s, return b; } -void bli_blksz_obj_init( blksz_t* b, - dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ) +void bli_blksz_obj_init + ( + blksz_t* b, + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ) 
{ b->v[BLIS_FLOAT] = b_s; b->v[BLIS_DOUBLE] = b_d; @@ -69,15 +75,21 @@ void bli_blksz_obj_init( blksz_t* b, b->e[BLIS_DCOMPLEX] = be_z; } -void bli_blksz_obj_free( blksz_t* b ) +void bli_blksz_obj_free + ( + blksz_t* b + ) { bli_free_intl( b ); } // ----------------------------------------------------------------------------- -void bli_blksz_reduce_dt_to( num_t dt_bm, blksz_t* bmult, - num_t dt_bs, blksz_t* blksz ) +void bli_blksz_reduce_dt_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ) { dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz ); dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz ); @@ -107,11 +119,30 @@ void bli_blksz_reduce_dt_to( num_t dt_bm, blksz_t* bmult, // ----------------------------------------------------------------------------- -dim_t bli_determine_blocksize_f( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ) +dim_t bli_determine_blocksize + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) +{ + if ( direct == BLIS_FWD ) + return bli_determine_blocksize_f( i, dim, obj, bszid, cntx ); + else + return bli_determine_blocksize_b( i, dim, obj, bszid, cntx ); +} + +dim_t bli_determine_blocksize_f + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) { num_t dt; blksz_t* bsize; @@ -130,10 +161,39 @@ dim_t bli_determine_blocksize_f( dim_t i, return b_use; } -dim_t bli_determine_blocksize_f_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ) +dim_t bli_determine_blocksize_b + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) +{ + num_t dt; + blksz_t* bsize; + dim_t b_alg, b_max; + dim_t b_use; + + // Extract the execution datatype and use it to query the corresponding + // blocksize and blocksize maximum values from the blksz_t object. 
+ dt = bli_obj_execution_datatype( *obj ); + bsize = bli_cntx_get_blksz( bszid, cntx ); + b_alg = bli_blksz_get_def( dt, bsize ); + b_max = bli_blksz_get_max( dt, bsize ); + + b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); + + return b_use; +} + +dim_t bli_determine_blocksize_f_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ) { dim_t b_now; dim_t dim_left_now; @@ -161,33 +221,13 @@ dim_t bli_determine_blocksize_f_sub( dim_t i, return b_now; } -dim_t bli_determine_blocksize_b( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ) -{ - num_t dt; - blksz_t* bsize; - dim_t b_alg, b_max; - dim_t b_use; - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_determine_blocksize_b_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ) +dim_t bli_determine_blocksize_b_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ) { dim_t b_now; dim_t dim_left_now; diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index 11a8cb650..daffb3772 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -97,43 +97,79 @@ // ----------------------------------------------------------------------------- -blksz_t* bli_blksz_obj_create( dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ); +blksz_t* bli_blksz_obj_create + ( + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ); -void bli_blksz_obj_init( blksz_t* b, - dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ); +void bli_blksz_obj_init + ( + blksz_t* b, + dim_t 
b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ); -void bli_blksz_obj_free( blksz_t* b ); +void bli_blksz_obj_free + ( + blksz_t* b + ); // ----------------------------------------------------------------------------- -void bli_blksz_reduce_dt_to( num_t dt_bm, blksz_t* bmult, - num_t dt_bs, blksz_t* blksz ); +void bli_blksz_reduce_dt_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ); // ----------------------------------------------------------------------------- -dim_t bli_determine_blocksize_f( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ); -dim_t bli_determine_blocksize_f_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ); +dim_t bli_determine_blocksize + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); -dim_t bli_determine_blocksize_b( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ); -dim_t bli_determine_blocksize_b_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ); +dim_t bli_determine_blocksize_f + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); + +dim_t bli_determine_blocksize_b + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); + +dim_t bli_determine_blocksize_f_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ); + +dim_t bli_determine_blocksize_b_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c new file mode 100644 index 000000000..3b39befe4 --- /dev/null +++ b/frame/base/bli_cntl.c @@ -0,0 +1,186 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +cntl_t* bli_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + void* params, + cntl_t* sub_node + ) +{ + cntl_t* cntl; + mem_t* pack_mem; + + // Allocate the cntl_t struct.
+ cntl = bli_malloc_intl( sizeof( cntl_t ) ); + + bli_cntl_set_bszid( bszid, cntl ); + bli_cntl_set_var_func( var_func, cntl ); + bli_cntl_set_params( params, cntl ); + bli_cntl_set_sub_node( sub_node, cntl ); + + // Query the address of the node's packed mem_t entry so we can initialize + // key fields (to NULL or 0). + // NOTE: This initialization is important, since it allows threads to + // discern whether blocks have been acquired from the memory allocator. + pack_mem = bli_cntl_pack_mem( cntl ); + bli_mem_clear( pack_mem ); + + return cntl; +} + +void bli_cntl_obj_free + ( + cntl_t* cntl + ) +{ + bli_free_intl( cntl ); +} + +void bli_cntl_obj_clear + ( + cntl_t* cntl + ) +{ + mem_t* pack_mem; + + // Clear various fields in the control tree. Clearing these fields + // actually is not needed, but we do it for debugging/completeness. + bli_cntl_set_var_func( NULL, cntl ); + bli_cntl_set_params( NULL, cntl ); + bli_cntl_set_sub_node( NULL, cntl ); + + // Clearing these fields is potentially more important if the control + // tree is cached somewhere and reused. + pack_mem = bli_cntl_pack_mem( cntl ); + bli_mem_clear( pack_mem ); +} + +// ----------------------------------------------------------------------------- + +void bli_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // Base case: simply return when asked to free NULL nodes. + if ( cntl == NULL ) return; + + cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); + void* cntl_params = bli_cntl_params( cntl ); + mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); + + thrinfo_t* thread_sub_node = bli_thrinfo_sub_node( thread ); + + // Recursively free all memory associated with the sub-node and its + // children. + bli_cntl_free( cntl_sub_node, thread_sub_node ); + + // Free the current node's params field, if it is non-NULL.
+ if ( cntl_params != NULL ) + { + bli_free_intl( cntl_params ); + } + + // Release the current node's pack mem_t entry back to the memory + // broker from which it originated, but only if the current thread + // is chief for its group, and only if the mem_t is allocated. + if ( bli_thread_am_ochief( thread ) ) + if ( bli_mem_is_alloc( cntl_pack_mem ) ) + { + bli_membrk_release( cntl_pack_mem ); + } + + // Free the current node. + bli_cntl_obj_free( cntl ); +} + +// ----------------------------------------------------------------------------- + +cntl_t* bli_cntl_copy + ( + cntl_t* cntl + ) +{ + // Make a copy of the current node. Notice that the source node + // should NOT have any allocated/cached mem_t entries, and that + // bli_cntl_obj_create() creates a node with a cleared mem_t + // field. + cntl_t* cntl_copy = bli_cntl_obj_create + ( + bli_cntl_bszid( cntl ), + bli_cntl_var_func( cntl ), + NULL, NULL + ); + + // Check the params field of the existing control tree; if it's non-NULL, + // copy it. + if ( bli_cntl_params( cntl ) != NULL ) + { + // Detect the size of the params struct by reading the first field + // as a uint64_t, and then allocate this many bytes for a new params + // struct. + uint64_t params_size = bli_cntl_params_size( cntl ); + void* params_orig = bli_cntl_params( cntl ); + void* params_copy = bli_malloc_intl( ( size_t )params_size ); + + // Copy the original params struct to the new memory region. + memcpy( params_copy, params_orig, ( size_t )params_size ); + + // Save the address of the new params struct into the new control + // tree node. + bli_cntl_set_params( params_copy, cntl_copy ); + } + + // If the sub-node exists, copy it recursively. + if ( bli_cntl_sub_node( cntl ) != NULL ) + { + cntl_t* sub_node_copy = bli_cntl_copy + ( + bli_cntl_sub_node( cntl ) + ); + + // Save the address of the new sub-node (sub-tree) in the copied + // node.
+ bli_cntl_set_sub_node( sub_node_copy, cntl_copy ); + } + + // Return the address of the newly created node. + return cntl_copy; +} + diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h new file mode 100644 index 000000000..7b6000bb9 --- /dev/null +++ b/frame/base/bli_cntl.h @@ -0,0 +1,153 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + + +/* +// -- Control tree node definition -- + +struct cntl_s +{ + // Basic fields (usually required). + bszid_t bszid; + void* var_func; + struct cntl_s* sub_node; + + // Optional fields (needed only by some operations such as packm). + // NOTE: first field of params must be a uint64_t containing the size + // of the struct. + void* params; + + // Internal fields that track "cached" data. + mem_t pack_mem; +}; +typedef struct cntl_s cntl_t; +*/ + + +// -- Control tree prototypes -- + +cntl_t* bli_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + void* params, + cntl_t* sub_node + ); + +void bli_cntl_obj_free + ( + cntl_t* cntl + ); + +void bli_cntl_obj_clear + ( + cntl_t* cntl + ); + +void bli_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +cntl_t* bli_cntl_copy + ( + cntl_t* cntl + ); + +// ----------------------------------------------------------------------------- + +// cntl_t query (fields only) + +#define bli_cntl_bszid( cntl ) \ +\ + ( cntl->bszid ) + +#define bli_cntl_var_func( cntl ) \ +\ + ( cntl->var_func ) + +#define bli_cntl_sub_node( cntl ) \ +\ + ( cntl->sub_node ) + +#define bli_cntl_params( cntl ) \ +\ + ( cntl->params ) + +#define bli_cntl_params_size( cntl ) \ +\ + ( *( ( uint64_t* )(cntl->params) ) ) + +#define bli_cntl_pack_mem( cntl ) \ +\ + ( &(cntl->pack_mem) ) + +// cntl_t query (complex) + +#define bli_cntl_is_leaf( cntl ) \ +\ + ( bli_cntl_sub_node( cntl ) == NULL ) + +#define bli_cntl_does_part( cntl ) \ +\ + ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ) + +// cntl_t modification + +#define bli_cntl_set_bszid( bszid0, cntl ) \ +{ \ + cntl->bszid = bszid0; \ +} + +#define bli_cntl_set_var_func( var_func0, cntl ) \ +{ \ + cntl->var_func = var_func0; \ +} + +#define bli_cntl_set_sub_node( sub_node0, cntl ) \ +{ \ + cntl->sub_node = sub_node0; \ +} + +#define bli_cntl_set_params( params0, cntl ) \ +{ \ + cntl->params = params0; \ +} + +#define bli_cntl_set_pack_mem( pack_mem0, cntl ) \ +{ \ + cntl->pack_mem = 
*(pack_mem0); \ +} + diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index d06167a07..f2885cca3 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -713,6 +713,36 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + return !bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); +} + +bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + num_t dt = bli_obj_datatype( *obj ); + + // Reference the ukr storage preferences of the corresponding real + // micro-kernel for induced methods. + if ( bli_cntx_get_ind_method( cntx ) != BLIS_NAT ) + dt = bli_obj_datatype_proj_to_real( *obj ); + + const bool_t ukr_prefers_rows + = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); + const bool_t ukr_prefers_cols + = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); + bool_t r_val = FALSE; + + if ( bli_obj_is_row_stored( *obj ) && ukr_prefers_cols ) r_val = TRUE; + else if ( bli_obj_is_col_stored( *obj ) && ukr_prefers_rows ) r_val = TRUE; + + return r_val; +} + // ----------------------------------------------------------------------------- void bli_cntx_print( cntx_t* cntx ) @@ -803,6 +833,12 @@ void bli_cntx_print( cntx_t* cntx ) ); } + { + ind_t family = bli_cntx_get_family( cntx ); + + printf( "oper family : %lu\n", ( guint_t )family ); + } + { ind_t method = bli_cntx_get_ind_method( cntx ); @@ -810,18 +846,3 @@ void bli_cntx_print( cntx_t* cntx ) } } - - - - - - - - - - - - - - - diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 337d233b3..21f9c0fe0 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -53,6 +53,7 @@ typedef struct cntx_s func_t packm_ukrs; + opid_t family; ind_t method; pack_t schema_a; pack_t schema_b; @@ -102,6 +103,10 @@ typedef struct cntx_s \ (&((cntx)->packm_ukrs) ) +#define bli_cntx_family( cntx ) \ +\ + ( (cntx)->family ) 
+ #define bli_cntx_method( cntx ) \ \ ( (cntx)->method ) @@ -164,6 +169,11 @@ typedef struct cntx_s (cntx_p)->packm_ukrs = _packm_ukrs; \ } +#define bli_cntx_set_family( _family, cntx_p ) \ +{ \ + (cntx_p)->family = _family; \ +} + #define bli_cntx_set_method( _method, cntx_p ) \ { \ (cntx_p)->method = _method; \ @@ -263,6 +273,10 @@ typedef struct cntx_s (dt), (&(bli_cntx_l3_nat_ukrs_prefs_buf( (cntx) ))[ ukr_id ]) \ ) +#define bli_cntx_get_family( cntx ) \ +\ + bli_cntx_family( cntx ) + #define bli_cntx_get_ind_method( cntx ) \ \ bli_cntx_method( cntx ) @@ -391,6 +405,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); // print function diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 6ae0f461e..7f3f897d5 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -754,6 +754,9 @@ static func_t bli_gks_l1v_kers[BLIS_NUM_LEVEL1V_KERS] = /* addv */ { { BLIS_SADDV_KERNEL, BLIS_CADDV_KERNEL, BLIS_DADDV_KERNEL, BLIS_ZADDV_KERNEL, } }, +/* amaxv */ { { BLIS_SAMAXV_KERNEL, BLIS_CAMAXV_KERNEL, + BLIS_DAMAXV_KERNEL, BLIS_ZAMAXV_KERNEL, } + }, /* axpbyv */ { { BLIS_SAXPBYV_KERNEL, BLIS_CAXPBYV_KERNEL, BLIS_DAXPBYV_KERNEL, BLIS_ZAXPBYV_KERNEL, } }, @@ -798,6 +801,9 @@ static func_t bli_gks_l1v_ref_kers[BLIS_NUM_LEVEL1V_KERS] = /* addv */ { { BLIS_SADDV_KERNEL_REF, BLIS_CADDV_KERNEL_REF, BLIS_DADDV_KERNEL_REF, BLIS_ZADDV_KERNEL_REF, } }, +/* amaxv */ { { BLIS_SAMAXV_KERNEL_REF, BLIS_CAMAXV_KERNEL_REF, + BLIS_DAMAXV_KERNEL_REF, BLIS_ZAMAXV_KERNEL_REF, } + }, /* axpbyv */ { { BLIS_SAXPBYV_KERNEL_REF, BLIS_CAXPBYV_KERNEL_REF, BLIS_DAXPBYV_KERNEL_REF, BLIS_ZAXPBYV_KERNEL_REF, } }, diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index fede4f823..4c63b604d 100644 --- 
a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -114,9 +114,9 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) // -- Memory pool-related ------------------------------------------------------ -gint_t bli_info_get_mk_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_A_BLOCK ); } -gint_t bli_info_get_kn_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_B_PANEL ); } -gint_t bli_info_get_mn_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_C_PANEL ); } +gint_t bli_info_get_mk_pool_size( void ) { return bli_membrk_pool_size( bli_memsys_global_membrk(), BLIS_BUFFER_FOR_A_BLOCK ); } +gint_t bli_info_get_kn_pool_size( void ) { return bli_membrk_pool_size( bli_memsys_global_membrk(), BLIS_BUFFER_FOR_B_PANEL ); } +gint_t bli_info_get_mn_pool_size( void ) { return bli_membrk_pool_size( bli_memsys_global_membrk(), BLIS_BUFFER_FOR_C_PANEL ); } diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 6e793fa40..db598cede 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -83,17 +83,14 @@ err_t bli_init( void ) { // Initialize various sub-APIs. bli_const_init(); - bli_cntl_init(); bli_error_init(); - bli_mem_init(); + bli_memsys_init(); bli_ind_init(); bli_thread_init(); // After initialization is complete, mark BLIS as initialized. bli_is_init = TRUE; - //bli_mem_init(); - // Only the thread that actually performs the initialization will // return "success". r_val = BLIS_SUCCESS; @@ -150,9 +147,8 @@ err_t bli_finalize( void ) { // Finalize various sub-APIs. 
bli_const_finalize(); - bli_cntl_finalize(); bli_error_finalize(); - bli_mem_finalize(); + bli_memsys_finalize(); bli_ind_finalize(); bli_thread_finalize(); diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index 9ef741934..82bd88afb 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -36,18 +36,91 @@ #ifndef BLIS_MEM_H #define BLIS_MEM_H -// ----------------------------------------------------------------------------- -membrk_t* bli_mem_global_membrk( void ); -siz_t bli_mem_pool_size( packbuf_t buf_type ); +// Mem entry query -// ----------------------------------------------------------------------------- +#define bli_mem_pblk( mem_p ) \ +\ + ( &((mem_p)->pblk) ) -void bli_mem_init( void ); -void bli_mem_reinit( cntx_t* cntx ); -void bli_mem_finalize( void ); -bool_t bli_mem_is_initialized( void ); +#define bli_mem_buffer( mem_p ) \ +\ + ( bli_pblk_buf_align( bli_mem_pblk( mem_p ) ) ) + +#define bli_mem_buf_sys( mem_p ) \ +\ + ( bli_pblk_buf_sys( bli_mem_pblk( mem_p ) ) ) + +#define bli_mem_buf_type( mem_p ) \ +\ + ( (mem_p)->buf_type ) + +#define bli_mem_pool( mem_p ) \ +\ + ( (mem_p)->pool ) + +#define bli_mem_membrk( mem_p ) \ +\ + ( (mem_p)->membrk ) + +#define bli_mem_size( mem_p ) \ +\ + ( (mem_p)->size ) + +#define bli_mem_is_alloc( mem_p ) \ +\ + ( bli_mem_buffer( mem_p ) != NULL ) + +#define bli_mem_is_unalloc( mem_p ) \ +\ + ( bli_mem_buffer( mem_p ) == NULL ) -#endif +// Mem entry modification +#define bli_mem_set_pblk( pblk_p, mem_p ) \ +{ \ + mem_p->pblk = *(pblk_p); \ +} + +#define bli_mem_set_buffer( buf0, mem_p ) \ +{ \ + bli_pblk_set_buf_align( buf0, &(mem_p->pblk) ); \ +} + +#define bli_mem_set_buf_sys( buf0, mem_p ) \ +{ \ + bli_pblk_set_buf_sys( buf0, &(mem_p->pblk) ); \ +} + +#define bli_mem_set_buf_type( buf_type0, mem_p ) \ +{ \ + (mem_p)->buf_type = buf_type0; \ +} + +#define bli_mem_set_pool( pool0, mem_p ) \ +{ \ + (mem_p)->pool = pool0; \ +} + +#define bli_mem_set_membrk( membrk0, mem_p ) \ +{ \ + (mem_p)->membrk = 
membrk0; \ +} + +#define bli_mem_set_size( size0, mem_p ) \ +{ \ + mem_p->size = size0; \ +} + +#define bli_mem_clear( mem_p ) \ +{ \ + bli_mem_set_buffer( NULL, mem_p ); \ + bli_mem_set_buf_sys( NULL, mem_p ); \ + bli_mem_set_pool( NULL, mem_p ); \ + bli_mem_set_size( 0, mem_p ); \ + bli_mem_set_membrk( NULL, mem_p ); \ +} + + +#endif diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c new file mode 100644 index 000000000..e66aafa63 --- /dev/null +++ b/frame/base/bli_memsys.c @@ -0,0 +1,174 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS +pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif + +static membrk_t global_membrk; + +// ----------------------------------------------------------------------------- + +membrk_t* bli_memsys_global_membrk( void ) +{ + return &global_membrk; +} + +// ----------------------------------------------------------------------------- + +static bool_t bli_memsys_is_init = FALSE; + +void bli_memsys_init( void ) +{ + cntx_t cntx; + + // If the initialization flag is TRUE, we know the API is already + // initialized, so we can return early. + if ( bli_memsys_is_init == TRUE ) return; + + // Create and initialize a context for gemm so we have something + // to pass into bli_membrk_init_pools(). + bli_gemm_cntx_init( &cntx ); + +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + // Here, we test the initialization flag again. NOTE: THIS IS NOT + // REDUNDANT. This additional test is needed so that other threads + // that may be waiting to acquire the lock do not perform any + // initialization actions once they are finally allowed into this + // critical section. + if ( bli_memsys_is_init == FALSE ) + { + // Initialize the global membrk_t object and its memory pools. 
+ bli_membrk_init( &cntx, &global_membrk ); + + // After initialization, mark the API as initialized. + bli_memsys_is_init = TRUE; + } + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif + + // Finalize the temporary gemm context. + bli_gemm_cntx_finalize( &cntx ); +} + +void bli_memsys_reinit( cntx_t* cntx ) +{ +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + // If for some reason the memory pools have not yet been + // initialized (unlikely), we emulate the body of bli_memsys_init(). + if ( bli_memsys_is_init == FALSE ) + { + // Initialize the global membrk_t object and its memory pools. + bli_membrk_init( cntx, &global_membrk ); + + // After initialization, mark the API as initialized. + bli_memsys_is_init = TRUE; + } + else + { + // Reinitialize the global membrk_t object's memory pools. + bli_membrk_reinit_pools( cntx, &global_membrk ); + } + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif +} + +void bli_memsys_finalize( void ) +{ + // If the initialization flag is FALSE, we know the API is already + // uninitialized, so we can return early. + if ( bli_memsys_is_init == FALSE ) return; + +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + // Here, we test the initialization flag again. NOTE: THIS IS NOT + // REDUNDANT. This additional test is needed so that other threads + // that may be waiting to acquire the lock do not perform any + // finalization actions once they are finally allowed into this + // critical section. + if ( bli_memsys_is_init == TRUE ) + { + // Finalize the global membrk_t object and its memory pools. 
+ bli_membrk_finalize( &global_membrk ); + + // After finalization, mark the API as uninitialized. + bli_memsys_is_init = FALSE; + } + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif +} + +bool_t bli_memsys_is_initialized( void ) +{ + return bli_memsys_is_init; +} + diff --git a/frame/3/herk/old/bli_herk_blk_var1f.h b/frame/base/bli_memsys.h similarity index 77% rename from frame/3/herk/old/bli_herk_blk_var1f.h rename to frame/base/bli_memsys.h index bd1d8a95f..0a7b142a7 100644 --- a/frame/3/herk/old/bli_herk_blk_var1f.h +++ b/frame/base/bli_memsys.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,10 +33,20 @@ */ -void bli_herk_blk_var1f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); +#ifndef BLIS_MEMSYS_H +#define BLIS_MEMSYS_H + +// ----------------------------------------------------------------------------- + +membrk_t* bli_memsys_global_membrk( void ); + +// ----------------------------------------------------------------------------- + +void bli_memsys_init( void ); +void bli_memsys_reinit( cntx_t* cntx ); +void bli_memsys_finalize( void ); +bool_t bli_memsys_is_initialized( void ); + + +#endif diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 226b0747a..e1f05d075 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -65,7 +65,6 @@ void bli_obj_create_without_buffer( num_t dt, obj_t* obj ) { siz_t elem_size; - mem_t* pack_mem; void* s; if ( bli_error_checking_is_enabled() ) @@ -98,9 +97,6 @@ void bli_obj_create_without_buffer( num_t dt, bli_obj_set_offs( 0, 0, *obj ); bli_obj_set_diag_offset( 0, *obj ); - pack_mem = bli_obj_pack_mem( *obj ); - bli_mem_set_buffer( NULL, pack_mem ); - // 
Set the internal scalar to 1.0. s = bli_obj_internal_scalar_buffer( *obj ); @@ -467,8 +463,6 @@ num_t bli_datatype_union( num_t dt1, num_t dt2 ) void bli_obj_print( char* label, obj_t* obj ) { FILE* file = stdout; - mem_t* pack_mem = bli_obj_pack_mem( *obj ); - //mem_t* cast_mem = bli_obj_cast_mem( *obj ); if ( bli_error_checking_is_enabled() ) bli_obj_print_check( label, obj ); @@ -491,10 +485,6 @@ void bli_obj_print( char* label, obj_t* obj ) fprintf( file, " rs, cs %ld, %ld\n", ( signed long int )bli_obj_row_stride( *obj ), ( signed long int )bli_obj_col_stride( *obj ) ); fprintf( file, " is %ld\n", ( signed long int )bli_obj_imag_stride( *obj ) ); - fprintf( file, " pack_mem \n" ); - fprintf( file, " - buf %p\n", ( void* )bli_mem_buffer( pack_mem ) ); - fprintf( file, " - buf_type %lu\n", ( unsigned long int )bli_mem_buf_type( pack_mem ) ); - fprintf( file, " - size %lu\n", ( unsigned long int )bli_mem_size( pack_mem ) ); fprintf( file, " m_padded %lu\n", ( unsigned long int )bli_obj_padded_length( *obj ) ); fprintf( file, " n_padded %lu\n", ( unsigned long int )bli_obj_padded_width( *obj ) ); fprintf( file, " ps %lu\n", ( unsigned long int )bli_obj_panel_stride( *obj ) ); diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index 64718353e..738284064 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -38,11 +38,31 @@ // -- Matrix partitioning ------------------------------------------------------ -void bli_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_mdim + ( + dir_t direct, + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) +{ + if ( direct == BLIS_FWD ) + bli_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); + else + bli_acquire_mpart_b2t( req_part, i, b, obj, sub_obj ); +} + + +void bli_acquire_mpart_t2b + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; dim_t n; @@ -59,14 +79,14 @@ void 
bli_acquire_mpart_t2b( subpart_t requested_part, // partitioned through normally.) if ( bli_obj_is_panel_packed( *obj ) ) { - bli_packm_acquire_mpart_t2b( requested_part, i, b, obj, sub_obj ); + bli_packm_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); return; } // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_acquire_mpart_t2b_check( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_t2b_check( req_part, i, b, obj, sub_obj ); // Query the m and n dimensions of the object (accounting for @@ -90,7 +110,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, // Compute offset increments and dimensions based on which // subpartition is being requested, assuming no transposition. - if ( requested_part == BLIS_SUBPART0 ) + if ( req_part == BLIS_SUBPART0 ) { // A0 (offm,offn) unchanged. // A0 is i x n. @@ -99,7 +119,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = i; n_part = n; } - else if ( requested_part == BLIS_SUBPART1T ) + else if ( req_part == BLIS_SUBPART1T ) { // A1T (offm,offn) unchanged. // A1T is (i+b) x n. @@ -108,7 +128,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = i + b; n_part = n; } - else if ( requested_part == BLIS_SUBPART1 ) + else if ( req_part == BLIS_SUBPART1 ) { // A1 (offm,offn) += (i,0). // A1 is b x n. @@ -117,7 +137,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = b; n_part = n; } - else if ( requested_part == BLIS_SUBPART1B ) + else if ( req_part == BLIS_SUBPART1B ) { // A1B (offm,offn) += (i,0). // A1B is (m-i) x n. @@ -126,7 +146,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = m - i; n_part = n; } - else // if ( requested_part == BLIS_SUBPART2 ) + else // if ( req_part == BLIS_SUBPART2 ) { // A2 (offm,offn) += (i+b,0). // A2 is (m-i-b) x n. 
@@ -208,11 +228,14 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, } -void bli_acquire_mpart_b2t( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_b2t + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; @@ -222,15 +245,35 @@ void bli_acquire_mpart_b2t( subpart_t requested_part, // Modify i to account for the fact that we are moving backwards. i = m - i - b; - bli_acquire_mpart_t2b( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); } -void bli_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_ndim + ( + dir_t direct, + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) +{ + if ( direct == BLIS_FWD ) + bli_acquire_mpart_l2r( req_part, i, b, obj, sub_obj ); + else + bli_acquire_mpart_r2l( req_part, i, b, obj, sub_obj ); +} + + +void bli_acquire_mpart_l2r + ( + subpart_t req_part, + dim_t j, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; dim_t n; @@ -247,14 +290,14 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, // partitioned through normally.) if ( bli_obj_is_panel_packed( *obj ) ) { - bli_packm_acquire_mpart_l2r( requested_part, j, b, obj, sub_obj ); + bli_packm_acquire_mpart_l2r( req_part, j, b, obj, sub_obj ); return; } // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_acquire_mpart_l2r_check( requested_part, j, b, obj, sub_obj ); + bli_acquire_mpart_l2r_check( req_part, j, b, obj, sub_obj ); // Query the m and n dimensions of the object (accounting for @@ -278,7 +321,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, // Compute offset increments and dimensions based on which // subpartition is being requested, assuming no transposition. - if ( requested_part == BLIS_SUBPART0 ) + if ( req_part == BLIS_SUBPART0 ) { // A0 (offm,offn) unchanged. // A0 is m x j. 
@@ -287,7 +330,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = j; } - else if ( requested_part == BLIS_SUBPART1L ) + else if ( req_part == BLIS_SUBPART1L ) { // A1L (offm,offn) unchanged. // A1L is m x (j+b). @@ -296,7 +339,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = j + b; } - else if ( requested_part == BLIS_SUBPART1 ) + else if ( req_part == BLIS_SUBPART1 ) { // A1 (offm,offn) += (0,j). // A1 is m x b. @@ -305,7 +348,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = b; } - else if ( requested_part == BLIS_SUBPART1R ) + else if ( req_part == BLIS_SUBPART1R ) { // A1R (offm,offn) += (0,j). // A1R is m x (n-j). @@ -314,7 +357,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = n - j; } - else // if ( requested_part == BLIS_SUBPART2 ) + else // if ( req_part == BLIS_SUBPART2 ) { // A2 (offm,offn) += (0,j+b). // A2 is m x (n-j-b). @@ -395,11 +438,14 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, } -void bli_acquire_mpart_r2l( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_r2l + ( + subpart_t req_part, + dim_t j, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t n; @@ -409,15 +455,18 @@ void bli_acquire_mpart_r2l( subpart_t requested_part, // Modify i to account for the fact that we are moving backwards. j = n - j - b; - bli_acquire_mpart_l2r( requested_part, j, b, obj, sub_obj ); + bli_acquire_mpart_l2r( req_part, j, b, obj, sub_obj ); } -void bli_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_tl2br + ( + subpart_t req_part, + dim_t ij, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; dim_t n; @@ -435,14 +484,14 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, // partitioned through normally.) 
if ( bli_obj_is_panel_packed( *obj ) ) { - bli_packm_acquire_mpart_tl2br( requested_part, ij, b, obj, sub_obj ); + bli_packm_acquire_mpart_tl2br( req_part, ij, b, obj, sub_obj ); return; } // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_acquire_mpart_tl2br_check( requested_part, ij, b, obj, sub_obj ); + bli_acquire_mpart_tl2br_check( req_part, ij, b, obj, sub_obj ); // Query the m and n dimensions of the object (accounting for @@ -469,7 +518,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, // subpartition is being requested, assuming no transposition. // Left column of subpartitions - if ( requested_part == BLIS_SUBPART00 ) + if ( req_part == BLIS_SUBPART00 ) { // A00 (offm,offn) unchanged. // A00 is ij x ij. @@ -478,7 +527,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = ij; n_part = ij; } - else if ( requested_part == BLIS_SUBPART10 ) + else if ( req_part == BLIS_SUBPART10 ) { // A10 (offm,offn) += (ij,0). // A10 is b x ij. @@ -487,7 +536,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = b; n_part = ij; } - else if ( requested_part == BLIS_SUBPART20 ) + else if ( req_part == BLIS_SUBPART20 ) { // A20 (offm,offn) += (ij+b,0). // A20 is (m-ij-b) x ij. @@ -498,7 +547,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, } // Middle column of subpartitions. - else if ( requested_part == BLIS_SUBPART01 ) + else if ( req_part == BLIS_SUBPART01 ) { // A01 (offm,offn) += (0,ij). // A01 is ij x b. @@ -507,7 +556,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = ij; n_part = b; } - else if ( requested_part == BLIS_SUBPART11 ) + else if ( req_part == BLIS_SUBPART11 ) { // A11 (offm,offn) += (ij,ij). // A11 is b x b. @@ -516,7 +565,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = b; n_part = b; } - else if ( requested_part == BLIS_SUBPART21 ) + else if ( req_part == BLIS_SUBPART21 ) { // A21 (offm,offn) += (ij+b,ij). // A21 is (m-ij-b) x b. 
@@ -527,7 +576,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, } // Right column of subpartitions. - else if ( requested_part == BLIS_SUBPART02 ) + else if ( req_part == BLIS_SUBPART02 ) { // A02 (offm,offn) += (0,ij+b). // A02 is ij x (n-ij-b). @@ -536,7 +585,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = ij; n_part = n - ij - b; } - else if ( requested_part == BLIS_SUBPART12 ) + else if ( req_part == BLIS_SUBPART12 ) { // A12 (offm,offn) += (ij,ij+b). // A12 is b x (n-ij-b). @@ -545,7 +594,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = b; n_part = n - ij - b; } - else // if ( requested_part == BLIS_SUBPART22 ) + else // if ( req_part == BLIS_SUBPART22 ) { // A22 (offm,offn) += (ij+b,ij+b). // A22 is (m-ij-b) x (n-ij-b). @@ -588,9 +637,9 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, // we let the subpartition inherit the storage structure of its immediate // parent. if ( !bli_obj_root_is_general( *sub_obj ) && - requested_part != BLIS_SUBPART00 && - requested_part != BLIS_SUBPART11 && - requested_part != BLIS_SUBPART22 ) + req_part != BLIS_SUBPART00 && + req_part != BLIS_SUBPART11 && + req_part != BLIS_SUBPART22 ) { // FGVZ: Fix me. This needs to be cleaned up. Either non-diagonal // intersecting subpartitions should inherit their root object's @@ -638,11 +687,14 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, } -void bli_acquire_mpart_br2tl( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_br2tl + ( + subpart_t req_part, + dim_t ij, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { // Query the dimension of the object. dim_t mn = bli_obj_length( *obj ); @@ -650,35 +702,73 @@ void bli_acquire_mpart_br2tl( subpart_t requested_part, // Modify ij to account for the fact that we are moving backwards. 
ij = mn - ij - b; - bli_acquire_mpart_tl2br( requested_part, ij, b, obj, sub_obj ); + bli_acquire_mpart_tl2br( req_part, ij, b, obj, sub_obj ); } // -- Vector partitioning ------------------------------------------------------ -void bli_acquire_vpart_f2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_vpart_f2b + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { if ( bli_obj_is_col_vector( *obj ) ) - bli_acquire_mpart_t2b( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); else // if ( bli_obj_is_row_vector( *obj ) ) - bli_acquire_mpart_l2r( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_l2r( req_part, i, b, obj, sub_obj ); } -void bli_acquire_vpart_b2f( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_vpart_b2f + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { if ( bli_obj_is_col_vector( *obj ) ) - bli_acquire_mpart_b2t( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_b2t( req_part, i, b, obj, sub_obj ); else // if ( bli_obj_is_row_vector( *obj ) ) - bli_acquire_mpart_r2l( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_r2l( req_part, i, b, obj, sub_obj ); +} + + +// -- Scalar acquisition ------------------------------------------------------- + + +void bli_acquire_mij + ( + dim_t i, + dim_t j, + obj_t* obj, + obj_t* sub_obj + ) +{ + obj_t tmp_obj; + + bli_acquire_mpart_l2r( BLIS_SUBPART1, j, 1, obj, &tmp_obj ); + bli_acquire_mpart_t2b( BLIS_SUBPART1, i, 1, &tmp_obj, sub_obj ); +} + + +void bli_acquire_vi + ( + dim_t i, + obj_t* obj, + obj_t* sub_obj + ) +{ + if ( bli_obj_is_col_vector( *obj ) ) + bli_acquire_mpart_t2b( BLIS_SUBPART1, i, 1, obj, sub_obj ); + else // if ( bli_obj_is_row_vector( *obj ) ) + bli_acquire_mpart_l2r( BLIS_SUBPART1, i, 1, obj, sub_obj ); } diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h 
index ed1fa0d15..fd24f1d82 100644 --- a/frame/base/bli_part.h +++ b/frame/base/bli_part.h @@ -36,50 +36,60 @@ // -- Matrix partitioning ------------------------------------------------------ -void bli_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); -void bli_acquire_mpart_b2t( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC0( opname ) \ + ( \ + dir_t direct, \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + obj_t* obj, \ + obj_t* sub_obj \ + ); -void bli_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); -void bli_acquire_mpart_r2l( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +GENPROT( acquire_mpart_mdim ) +GENPROT( acquire_mpart_ndim ) -void bli_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); -void bli_acquire_mpart_br2tl( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC0( opname ) \ + ( \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + obj_t* obj, \ + obj_t* sub_obj \ + ); + +GENPROT( acquire_mpart_t2b ) +GENPROT( acquire_mpart_b2t ) +GENPROT( acquire_mpart_l2r ) +GENPROT( acquire_mpart_r2l ) +GENPROT( acquire_mpart_tl2br ) +GENPROT( acquire_mpart_br2tl ) // -- Vector partitioning ------------------------------------------------------ -void bli_acquire_vpart_f2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +GENPROT( acquire_vpart_f2b ) +GENPROT( acquire_vpart_b2f ) -void bli_acquire_vpart_b2f( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +// -- Scalar acquisition ------------------------------------------------------- + +void bli_acquire_mij + ( + dim_t i, + dim_t j, + obj_t* obj, + obj_t* sub_obj 
+ ); + +void bli_acquire_vi + ( + dim_t i, + obj_t* obj, + obj_t* sub_obj + ); diff --git a/frame/base/old/bli_mem.c.prev b/frame/base/old/bli_mem.c.prev new file mode 100644 index 000000000..7a16e8732 --- /dev/null +++ b/frame/base/old/bli_mem.c.prev @@ -0,0 +1,366 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS +extern pthread_mutex_t mem_manager_mutex; +#endif + +// Declare one memory pool structure for each block size/shape we want to +// be able to allocate. + +static pool_t pools[3]; + + +// Physically contiguous memory for each pool. +// +// Generally speaking, the pool sizes are computed in a sub-header of blis.h +// as follows: +// +// BLIS_MK_POOL_SIZE = BLIS_MAXIMUM_MC_? * BLIS_MAXIMUM_KC_? * BLIS_SIZEOF_? +// +// where "?" is the datatype that results in the largest pool size. The +// constants BLIS_KN_POOL_SIZE and BLIS_MN_POOL_SIZE are computed in a +// similar manner. All constants are computed with appropriate "padding" +// to ensure enough space given the alignments required by bli_config.h. +// + +static void* pool_mk_blk_ptrs[ BLIS_NUM_MC_X_KC_BLOCKS ]; +static void* pool_kn_blk_ptrs[ BLIS_NUM_KC_X_NC_BLOCKS ]; +static void* pool_mn_blk_ptrs[ BLIS_NUM_MC_X_NC_BLOCKS ]; + +#define BLIS_USE_HEAP + +#ifdef BLIS_USE_HEAP +static char* pool_mk_mem = NULL; +static char* pool_kn_mem = NULL; +static char* pool_mn_mem = NULL; +#else +static char pool_mk_mem[ BLIS_MK_POOL_SIZE ]; +static char pool_kn_mem[ BLIS_KN_POOL_SIZE ]; +static char pool_mn_mem[ BLIS_MN_POOL_SIZE ]; +#endif + + + +void bli_mem_acquire_m( siz_t req_size, + packbuf_t buf_type, + mem_t* mem ) +{ + siz_t block_size; + dim_t pool_index; + pool_t* pool; + void** block_ptrs; + void* block; + gint_t i; + + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // For general-use buffer requests, such as those used by level-2 + // operations, using bli_malloc() is sufficient, since using + // physically contiguous memory is not as important there. + block = bli_malloc( req_size ); + + // Initialize the mem_t object with: + // - the address of the memory block, + // - the buffer type (a packbuf_t value), and + // - the size of the requested region. 
+ // NOTE: We do not initialize the pool field since this block did not + // come from a contiguous memory pool. + bli_mem_set_buffer( block, mem ); + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_size( req_size, mem ); + } + else + { + // This branch handles cases where the memory block needs to come + // from one of the contiguous memory pools. + + // Map the requested packed buffer type to a zero-based index, which + // we then use to select the corresponding memory pool. + pool_index = bli_packbuf_index( buf_type ); + pool = &pools[ pool_index ]; + + // Unconditionally perform error checking on the memory pool. + { + err_t e_val; + + // Make sure that the requested matrix size fits inside of a block + // of the corresponding pool. + e_val = bli_check_requested_block_size_for_pool( req_size, pool ); + bli_check_error_code( e_val ); + + // Make sure that the pool contains at least one block to check out + // to the thread. + e_val = bli_check_if_exhausted_pool( pool ); + bli_check_error_code( e_val ); + } + + // Access the block pointer array from the memory pool data structure. + block_ptrs = bli_pool_block_ptrs( pool ); + + + // BEGIN CRITICAL SECTION +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + { + + // Query the index of the contiguous memory block that resides at the + // "top" of the pool. + i = bli_pool_top_index( pool ); + + // Extract the address of the top block from the block pointer array. + block = block_ptrs[i]; + + // Clear the entry from the block pointer array. (This is actually not + // necessary.) + //block_ptrs[i] = NULL; + + // Decrement the top of the memory pool. + bli_pool_dec_top_index( pool ); + + + // END CRITICAL SECTION + } +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif + + // Query the size of the blocks in the pool so we can store it in the + // mem_t object. 
+ block_size = bli_pool_block_size( pool ); + + // Initialize the mem_t object with: + // - the address of the memory block, + // - the buffer type (a packbuf_t value), + // - the address of the memory pool to which it belongs, and + // - the size of the contiguous memory block (NOT the size of the + // requested region). + bli_mem_set_buffer( block, mem ); + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_pool( pool, mem ); + bli_mem_set_size( block_size, mem ); + } +} + + +void bli_mem_release( mem_t* mem ) +{ + packbuf_t buf_type; + pool_t* pool; + void** block_ptrs; + void* block; + gint_t i; + + // Extract the address of the memory block we are trying to + // release. + block = bli_mem_buffer( mem ); + + // Extract the buffer type so we know what kind of memory was allocated. + buf_type = bli_mem_buf_type( mem ); + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // For general-use buffers, we allocate with bli_malloc(), and so + // here we need to call bli_free(). + bli_free( block ); + } + else + { + // This branch handles cases where the memory block came from one + // of the contiguous memory pools. + + // Extract the pool from which the block was allocated. + pool = bli_mem_pool( mem ); + + // Extract the block pointer array associated with the pool. + block_ptrs = bli_pool_block_ptrs( pool ); + + + // BEGIN CRITICAL SECTION +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + { + + // Increment the top of the memory pool. + bli_pool_inc_top_index( pool ); + + // Query the newly incremented top index. + i = bli_pool_top_index( pool ); + + // Place the address of the block back onto the top of the memory pool. + block_ptrs[i] = block; + + + // END CRITICAL SECTION + } +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif + } + + + // Clear the mem_t object so that it appears unallocated. 
We clear: + // - the buffer field, + // - the pool field, and + // - the size field. + // NOTE: We do not clear the buf_type field since there is no + // "uninitialized" value for packbuf_t. + bli_mem_set_buffer( NULL, mem ); + bli_mem_set_pool( NULL, mem ); + bli_mem_set_size( 0, mem ); +} + + +void bli_mem_acquire_v( siz_t req_size, + mem_t* mem ) +{ + bli_mem_acquire_m( req_size, + BLIS_BUFFER_FOR_GEN_USE, + mem ); +} + + + +void bli_mem_init() +{ + dim_t index_a; + dim_t index_b; + dim_t index_c; + +#ifdef BLIS_USE_HEAP + pool_mk_mem = bli_malloc( BLIS_MK_POOL_SIZE ); + pool_kn_mem = bli_malloc( BLIS_KN_POOL_SIZE ); + pool_mn_mem = bli_malloc( BLIS_MN_POOL_SIZE ); +#endif + + // Map each of the packbuf_t values to an index starting at zero. + index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); + index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); + index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); + + // Initialize contiguous memory pool for MC x KC blocks. + bli_mem_init_pool( pool_mk_mem, + BLIS_MK_BLOCK_SIZE, + BLIS_NUM_MC_X_KC_BLOCKS, + pool_mk_blk_ptrs, + &pools[ index_a ] ); + + // Initialize contiguous memory pool for KC x NC blocks. + bli_mem_init_pool( pool_kn_mem, + BLIS_KN_BLOCK_SIZE, + BLIS_NUM_KC_X_NC_BLOCKS, + pool_kn_blk_ptrs, + &pools[ index_b ] ); + + // Initialize contiguous memory pool for MC x NC blocks. + bli_mem_init_pool( pool_mn_mem, + BLIS_MN_BLOCK_SIZE, + BLIS_NUM_MC_X_NC_BLOCKS, + pool_mn_blk_ptrs, + &pools[ index_c ] ); +} + + +void bli_mem_init_pool( char* pool_mem, + siz_t block_size, + dim_t num_blocks, + void** block_ptrs, + pool_t* pool ) +{ + const siz_t align_size = BLIS_CONTIG_ADDR_ALIGN_SIZE; + dim_t i; + + // If the pool starting address is not already aligned, advance it + // accordingly. + if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) + { + // Notice that this works even if the alignment is not a power of two. 
+ pool_mem += ( ( uintptr_t )align_size - + ( ( uintptr_t )pool_mem % align_size ) ); + } + + // Step through the memory pool, beginning with the aligned address + // determined above, assigning pointers to the beginning of each block_size + // bytes to the ith element of the block_ptrs array. + for ( i = 0; i < num_blocks; ++i ) + { + // Save the address of pool, which is guaranteed to be aligned. + block_ptrs[i] = pool_mem; + + // Advance pool by one block. + pool_mem += block_size; + + // Advance pool a bit further if needed in order to get to the + // beginning of an alignment boundary. + if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) + { + pool_mem += ( ( uintptr_t )align_size - + ( ( uintptr_t )pool_mem % align_size ) ); + } + } + + // Now that we have initialized the array of pointers to the individual + // blocks in the pool, we initialize a pool_t data structure so that we + // can easily manage this pool. + bli_pool_init( num_blocks, + block_size, + block_ptrs, + pool ); +} + + + +void bli_mem_finalize() +{ + // Nothing to do. + +#ifdef BLIS_USE_HEAP + bli_free( pool_mk_mem ); + bli_free( pool_kn_mem ); + bli_free( pool_mn_mem ); +#endif + +} + diff --git a/frame/compat/bla_amax.c b/frame/compat/bla_amax.c index 24aa192e3..1b63e0b7e 100644 --- a/frame/compat/bla_amax.c +++ b/frame/compat/bla_amax.c @@ -80,7 +80,8 @@ f77_int PASTEF772(i,chx,blasname) \ ); \ \ /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran) - index. */ \ + index. Also, if the BLAS integer size differs from the BLIS + integer size, that typecast occurs here. */ \ f77_index = bli_index + 1; \ \ /* Finalize BLIS (if it was initialized above). 
*/ \ diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 9ac03de97..a50968845 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -46,6 +46,5 @@ extern obj_t BLIS_MINUS_TWO; extern thrcomm_t BLIS_SINGLE_COMM; extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; -extern thrinfo_t BLIS_HERK_SINGLE_THREADED; #endif diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 00a2aa4b9..355412e2b 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -705,6 +705,24 @@ // Level-1v // +// amaxv kernels + +#ifndef BLIS_SAMAXV_KERNEL +#define BLIS_SAMAXV_KERNEL BLIS_SAMAXV_KERNEL_REF +#endif + +#ifndef BLIS_DAMAXV_KERNEL +#define BLIS_DAMAXV_KERNEL BLIS_DAMAXV_KERNEL_REF +#endif + +#ifndef BLIS_CAMAXV_KERNEL +#define BLIS_CAMAXV_KERNEL BLIS_CAMAXV_KERNEL_REF +#endif + +#ifndef BLIS_ZAMAXV_KERNEL +#define BLIS_ZAMAXV_KERNEL BLIS_ZAMAXV_KERNEL_REF +#endif + // addv kernels #ifndef BLIS_SADDV_KERNEL diff --git a/frame/include/bli_kernel_pre_macro_defs.h b/frame/include/bli_kernel_pre_macro_defs.h index 98e4c3928..30ed3e3f2 100644 --- a/frame/include/bli_kernel_pre_macro_defs.h +++ b/frame/include/bli_kernel_pre_macro_defs.h @@ -260,6 +260,13 @@ #define BLIS_CADDV_KERNEL_REF bli_caddv_ref #define BLIS_ZADDV_KERNEL_REF bli_zaddv_ref +// amaxv kernels + +#define BLIS_SAMAXV_KERNEL_REF bli_samaxv_ref +#define BLIS_DAMAXV_KERNEL_REF bli_damaxv_ref +#define BLIS_CAMAXV_KERNEL_REF bli_camaxv_ref +#define BLIS_ZAMAXV_KERNEL_REF bli_zamaxv_ref + // axpbyv kernels #define BLIS_SAXPBYV_KERNEL_REF bli_saxpbyv_ref diff --git a/frame/include/bli_kernel_prototypes.h b/frame/include/bli_kernel_prototypes.h index e693825ff..b788bbc1c 100644 --- a/frame/include/bli_kernel_prototypes.h +++ b/frame/include/bli_kernel_prototypes.h @@ -164,6 +164,11 @@ #define bli_caddv_ker_name BLIS_CADDV_KERNEL #define 
bli_zaddv_ker_name BLIS_ZADDV_KERNEL +#define bli_samaxv_ker_name BLIS_SAMAXV_KERNEL +#define bli_damaxv_ker_name BLIS_DAMAXV_KERNEL +#define bli_camaxv_ker_name BLIS_CAMAXV_KERNEL +#define bli_zamaxv_ker_name BLIS_ZAMAXV_KERNEL + #define bli_saxpbyv_ker_name BLIS_SAXPBYV_KERNEL #define bli_daxpbyv_ker_name BLIS_DAXPBYV_KERNEL #define bli_caxpbyv_ker_name BLIS_CAXPBYV_KERNEL diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index 01cf44e79..d99be2345 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -120,14 +120,12 @@ #include "bli_gentfunc_macro_defs.h" #include "bli_gentprot_macro_defs.h" -#include "bli_mem_macro_defs.h" #include "bli_obj_macro_defs.h" #include "bli_param_macro_defs.h" #include "bli_complex_macro_defs.h" #include "bli_scalar_macro_defs.h" #include "bli_error_macro_defs.h" #include "bli_blas_macro_defs.h" -#include "bli_auxinfo_macro_defs.h" #endif diff --git a/frame/include/bli_mem_macro_defs.h b/frame/include/bli_mem_macro_defs.h deleted file mode 100644 index d0fe850cd..000000000 --- a/frame/include/bli_mem_macro_defs.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2016 Hewlett Packard Enterprise Development LP - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_MEM_MACRO_DEFS_H -#define BLIS_MEM_MACRO_DEFS_H - - -// Mem entry query - -#define bli_mem_pblk( mem_p ) \ -\ - ( &((mem_p)->pblk) ) - -#define bli_mem_buffer( mem_p ) \ -\ - ( bli_pblk_buf_align( bli_mem_pblk( mem_p ) ) ) - -#define bli_mem_buf_sys( mem_p ) \ -\ - ( bli_pblk_buf_sys( bli_mem_pblk( mem_p ) ) ) - -#define bli_mem_buf_type( mem_p ) \ -\ - ( (mem_p)->buf_type ) - -#define bli_mem_pool( mem_p ) \ -\ - ( (mem_p)->pool ) - -#define bli_mem_membrk( mem_p ) \ -\ - ( (mem_p)->membrk ) - -#define bli_mem_size( mem_p ) \ -\ - ( (mem_p)->size ) - -#define bli_mem_is_alloc( mem_p ) \ -\ - ( bli_mem_buffer( mem_p ) != NULL ) - -#define bli_mem_is_unalloc( mem_p ) \ -\ - ( bli_mem_buffer( mem_p ) == NULL ) - - -// Mem entry modification - -#define bli_mem_set_pblk( pblk_p, mem_p ) \ -{ \ - mem_p->pblk = *(pblk_p); \ -} - -#define bli_mem_set_buffer( buf0, mem_p ) \ -{ \ - bli_pblk_set_buf_align( buf0, &(mem_p->pblk) ); \ -} - -#define bli_mem_set_buf_sys( buf0, mem_p ) \ 
-{ \ - bli_pblk_set_buf_sys( buf0, &(mem_p->pblk) ); \ -} - -#define bli_mem_set_buf_type( buf_type0, mem_p ) \ -{ \ - (mem_p)->buf_type = buf_type0; \ -} - -#define bli_mem_set_pool( pool0, mem_p ) \ -{ \ - (mem_p)->pool = pool0; \ -} - -#define bli_mem_set_membrk( membrk0, mem_p ) \ -{ \ - (mem_p)->membrk = membrk0; \ -} - -#define bli_mem_set_size( size0, mem_p ) \ -{ \ - mem_p->size = size0; \ -} - -#define bli_mem_clear( mem_p ) \ -{ \ - bli_mem_set_buffer( NULL, mem_p ); \ - bli_mem_set_buf_sys( NULL, mem_p ); \ - bli_mem_set_pool( NULL, mem_p ); \ - bli_mem_set_size( 0, mem_p ); \ - bli_mem_set_membrk( NULL, mem_p ); \ -} - - -#endif diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 306c09544..0d5992900 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -812,21 +812,6 @@ bli_obj_width_stored( obj ) (obj).elem_size = size; \ } - -// Pack mem_t entry query - -#define bli_obj_pack_mem( obj ) \ -\ - ( &((obj).pack_mem) ) - -// Pack mem_t entry modification - -#define bli_obj_set_pack_mem( mem_p, obj ) \ -{ \ - (obj).pack_mem = *mem_p; \ -} - - // Packed matrix info query #define bli_obj_padded_length( obj ) \ @@ -839,6 +824,12 @@ bli_obj_width_stored( obj ) // Packed matrix info modification +#define bli_obj_set_buffer_to_mem( mem_p, obj ) \ +{ \ + void* buf = bli_mem_buffer( mem_p ); \ + bli_obj_set_buffer( buf, obj ); \ +} \ + #define bli_obj_set_padded_length( m0, obj ) \ { \ (obj).m_padded = m0; \ @@ -900,15 +891,7 @@ bli_obj_width_stored( obj ) // -- Miscellaneous object macros -- -// Make a special alias (shallow copy) that does not overwrite pack_mem -// entry. 
- -#define bli_obj_alias_for_packing( a, b ) \ -{ \ - bli_obj_init_basic_shallow_copy_of( a, b ); \ -} - -// Make a full alias (shallow copy), including pack_mem and friends +// Make a full alias (shallow copy) #define bli_obj_alias_to( a, b ) \ { \ @@ -948,28 +931,6 @@ bli_obj_width_stored( obj ) } -// Initialize object for packing purposes - -#define bli_obj_init_pack( obj_p ) \ -{ \ - mem_t* pack_mem_ = bli_obj_pack_mem( *obj_p ); \ -\ - bli_mem_set_buffer( NULL, pack_mem_ ); \ -} - - -// Release object's pack mem_t entries back to memory manager - -#define bli_obj_release_pack( obj_p ) \ -{ \ - mem_t* pack_mem_ = bli_obj_pack_mem( *(obj_p) ); \ -\ - if ( bli_mem_is_alloc( pack_mem_ ) ) \ - bli_membrk_release( pack_mem_ ); \ -} - - - // Submatrix/scalar buffer acquisition #define BLIS_CONSTANT_SLOT_SIZE BLIS_MAX_TYPE_SIZE diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 8869cea17..50ddd5d1f 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -1104,6 +1104,14 @@ else if ( bli_is_scomplex( dt ) ) PASTEMAC(c,fname)(o0,o1,o2); \ else if ( bli_is_dcomplex( dt ) ) PASTEMAC(z,fname)(o0,o1,o2); \ } +#define bli_call_ft_3i( dt, fname, o0, o1, o2 ) \ +{ \ + if ( bli_is_float( dt ) ) PASTEMAC(s,fname)(o0,o1,o2); \ + else if ( bli_is_double( dt ) ) PASTEMAC(d,fname)(o0,o1,o2); \ + else if ( bli_is_scomplex( dt ) ) PASTEMAC(c,fname)(o0,o1,o2); \ + else if ( bli_is_dcomplex( dt ) ) PASTEMAC(z,fname)(o0,o1,o2); \ + else if ( bli_is_int( dt ) ) PASTEMAC(i,fname)(o0,o1,o2); \ +} #define bli_call_ft_4( dt, fname, o0, o1, o2, o3 ) \ { \ if ( bli_is_float( dt ) ) PASTEMAC(s,fname)(o0,o1,o2,o3); \ diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 5f52c89b7..f4e3e4aa0 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -501,233 +501,14 @@ typedef enum } packbuf_t; -// -// -- BLIS misc. 
structure types ----------------------------------------------- -// +// -- Partitioning direction -- -// -- Mutex type -- - -typedef struct mtx_s mtx_t; - -// -- Pool block type -- - -typedef struct +typedef enum { - void* buf_sys; - void* buf_align; -} pblk_t; + BLIS_FWD, + BLIS_BWD +} dir_t; -// -- Pool type -- - -typedef struct -{ - pblk_t* block_ptrs; - dim_t block_ptrs_len; - - dim_t top_index; - dim_t num_blocks; - - siz_t block_size; - siz_t align_size; -} pool_t; - -// -- Memory broker object type -- - -typedef struct membrk_s membrk_t; -/* -{ - pool_t pools[3]; - mtx_t mutex; - - malloc_ft malloc_fp; - free_ft free_fp; -} membrk_t; -*/ - -// -- Memory object type -- - -typedef struct mem_s -{ - pblk_t pblk; - packbuf_t buf_type; - pool_t* pool; - membrk_t* membrk; - siz_t size; -} mem_t; - -// -- Blocksize object type -- - -typedef struct blksz_s -{ - // Primary blocksize values. - dim_t v[BLIS_NUM_FP_TYPES]; - - // Blocksize extensions. - dim_t e[BLIS_NUM_FP_TYPES]; - -} blksz_t; - -// -- Function pointer object type -- - -typedef struct func_s -{ - // Kernel function address. - void* ptr[BLIS_NUM_FP_TYPES]; - -} func_t; - -// -- Multi-boolean object type -- - -typedef struct mbool_s -{ - bool_t v[BLIS_NUM_FP_TYPES]; - -} mbool_t; - -// -- Auxiliary kernel info type -- - -// Note: This struct is used by macro-kernels to package together extra -// parameter values that may be of use to the micro-kernel without -// cluttering up the micro-kernel interface itself. - -typedef struct -{ - // The pack schemas of A and B. - pack_t schema_a; - pack_t schema_b; - - // Pointers to the micro-panels of A and B which will be used by the - // next call to the micro-kernel. - void* a_next; - void* b_next; - - // The imaginary strides of A and B. 
- inc_t is_a; - inc_t is_b; - -} auxinfo_t; - - - -// -// -- BLIS object type definitions --------------------------------------------- -// - -typedef struct obj_s -{ - // Basic fields - struct obj_s* root; - - dim_t off[2]; - dim_t dim[2]; - doff_t diag_off; - - objbits_t info; - siz_t elem_size; - - void* buffer; - inc_t rs; - inc_t cs; - inc_t is; - - // Bufferless scalar storage - atom_t scalar; - - // Pack-related fields - mem_t pack_mem; // cached memory region for packing - dim_t m_padded; // m dimension of matrix, including any padding - dim_t n_padded; // n dimension of matrix, including any padding - inc_t ps; // panel stride (distance to next panel) - inc_t pd; // panel dimension (the "width" of a panel: - // usually MR or NR) - dim_t m_panel; // m dimension of a "full" panel - dim_t n_panel; // n dimension of a "full" panel -} obj_t; - - -// Define these macros here since they must be updated if contents of -// obj_t changes. -#define bli_obj_init_basic_shallow_copy_of( a, b ) \ -{ \ - (b).root = (a).root; \ -\ - (b).off[0] = (a).off[0]; \ - (b).off[1] = (a).off[1]; \ - (b).dim[0] = (a).dim[0]; \ - (b).dim[1] = (a).dim[1]; \ - (b).diag_off = (a).diag_off; \ -\ - (b).info = (a).info; \ - (b).elem_size = (a).elem_size; \ -\ - (b).buffer = (a).buffer; \ - (b).rs = (a).rs; \ - (b).cs = (a).cs; \ - (b).is = (a).is; \ -\ - (b).scalar = (a).scalar; \ -\ - /* We must NOT copy pack_mem field since this macro forms the basis of - bli_obj_alias_to(), which is used in packm_init(). There, we want to - copy the basic fields of the obj_t but PRESERVE the pack_mem field - of the destination object since it holds the "cached" mem_t object - and buffer. The other fields, such as padded dimensions, are always - set by bli_packm_init(), so we don't need to copy them either. */ \ -} - -#define bli_obj_init_full_shallow_copy_of( a, b ) \ -{ \ - /* This macro implements a full alias (shallow copy) that copies all - fields of the obj_t struct. 
*/ \ - bli_obj_init_basic_shallow_copy_of( a, b ); \ -\ - (b).pack_mem = (a).pack_mem; \ - (b).m_padded = (a).m_padded; \ - (b).n_padded = (a).n_padded; \ - (b).ps = (a).ps; \ - (b).pd = (a).pd; \ - (b).m_panel = (a).m_panel; \ - (b).n_panel = (a).n_panel; \ -} - -#define bli_obj_init_subpart_from( a, b ) \ -{ \ - (b).root = (a).root; \ -\ - (b).off[0] = (a).off[0]; \ - (b).off[1] = (a).off[1]; \ - /* Avoid copying m since it will be overwritten. */ \ - /* Avoid copying n since it will be overwritten. */ \ - (b).diag_off = (a).diag_off; \ -\ - (b).info = (a).info; \ - (b).elem_size = (a).elem_size; \ -\ - (b).buffer = (a).buffer; \ - (b).rs = (a).rs; \ - (b).cs = (a).cs; \ - (b).is = (a).is; \ -\ - (b).scalar = (a).scalar; \ -\ - /* We want to copy the pack_mem field here because this macro is used - when creating subpartitions, including those of packed objects. In - those situations, we want the subpartition to inherit the pack_mem - field of its parent, as well as other related fields such as the - padded dimensions. 
*/ \ - (b).pack_mem = (a).pack_mem; \ - (b).m_padded = (a).m_padded; \ - (b).n_padded = (a).n_padded; \ - (b).pd = (a).pd; \ - (b).ps = (a).ps; \ - (b).m_panel = (a).m_panel; \ - (b).n_panel = (a).n_panel; \ -} - - -// -// -- Other BLIS enumerated type definitions ----------------------------------- -// // -- Subpartition type -- @@ -782,6 +563,7 @@ typedef enum #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 + // -- Induced method types -- typedef enum @@ -798,11 +580,13 @@ typedef enum #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) + // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, + BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, @@ -817,7 +601,8 @@ typedef enum BLIS_XPBYV_KER, } l1vkr_t; -#define BLIS_NUM_LEVEL1V_KERS 13 +#define BLIS_NUM_LEVEL1V_KERS 14 + typedef enum { @@ -830,6 +615,7 @@ typedef enum #define BLIS_NUM_LEVEL1F_KERS 5 + typedef enum { BLIS_GEMM_UKR = 0, @@ -841,6 +627,7 @@ typedef enum #define BLIS_NUM_LEVEL3_UKRS 5 + typedef enum { BLIS_REFERENCE_UKERNEL = 0, @@ -902,11 +689,245 @@ typedef enum BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_VF, // level-1v vector fusing factor + + BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 13 +// +// -- BLIS misc. 
structure types ----------------------------------------------- +// + +// -- Mutex type -- + +typedef struct mtx_s mtx_t; + +// -- Pool block type -- + +typedef struct +{ + void* buf_sys; + void* buf_align; +} pblk_t; + +// -- Pool type -- + +typedef struct +{ + pblk_t* block_ptrs; + dim_t block_ptrs_len; + + dim_t top_index; + dim_t num_blocks; + + siz_t block_size; + siz_t align_size; +} pool_t; + +// -- Memory broker object type -- + +typedef struct membrk_s membrk_t; +/* +{ + pool_t pools[3]; + mtx_t mutex; + + malloc_ft malloc_fp; + free_ft free_fp; +} membrk_t; +*/ + +// -- Memory object type -- + +typedef struct mem_s +{ + pblk_t pblk; + packbuf_t buf_type; + pool_t* pool; + membrk_t* membrk; + siz_t size; +} mem_t; + +// -- Control tree node type -- + +struct cntl_s +{ + // Basic fields (usually required). + bszid_t bszid; + void* var_func; + struct cntl_s* sub_node; + + // Optional fields (needed only by some operations such as packm). + // NOTE: first field of params must be a uint64_t containing the size + // of the struct. + void* params; + + // Internal fields that track "cached" data. + mem_t pack_mem; +}; +typedef struct cntl_s cntl_t; + + +// -- Blocksize object type -- + +typedef struct blksz_s +{ + // Primary blocksize values. + dim_t v[BLIS_NUM_FP_TYPES]; + + // Blocksize extensions. + dim_t e[BLIS_NUM_FP_TYPES]; + +} blksz_t; + + +// -- Function pointer object type -- + +typedef struct func_s +{ + // Kernel function address. + void* ptr[BLIS_NUM_FP_TYPES]; + +} func_t; + + +// -- Multi-boolean object type -- + +typedef struct mbool_s +{ + bool_t v[BLIS_NUM_FP_TYPES]; + +} mbool_t; + + +// -- Auxiliary kernel info type -- + +// Note: This struct is used by macro-kernels to package together extra +// parameter values that may be of use to the micro-kernel without +// cluttering up the micro-kernel interface itself. + +typedef struct +{ + // The pack schemas of A and B. 
+ pack_t schema_a; + pack_t schema_b; + + // Pointers to the micro-panels of A and B which will be used by the + // next call to the micro-kernel. + void* a_next; + void* b_next; + + // The imaginary strides of A and B. + inc_t is_a; + inc_t is_b; + +} auxinfo_t; + + +// +// -- BLIS object type definitions --------------------------------------------- +// + +typedef struct obj_s +{ + // Basic fields + struct obj_s* root; + + dim_t off[2]; + dim_t dim[2]; + doff_t diag_off; + + objbits_t info; + siz_t elem_size; + + void* buffer; + inc_t rs; + inc_t cs; + inc_t is; + + // Bufferless scalar storage + atom_t scalar; + + // Pack-related fields + dim_t m_padded; // m dimension of matrix, including any padding + dim_t n_padded; // n dimension of matrix, including any padding + inc_t ps; // panel stride (distance to next panel) + inc_t pd; // panel dimension (the "width" of a panel: + // usually MR or NR) + dim_t m_panel; // m dimension of a "full" panel + dim_t n_panel; // n dimension of a "full" panel +} obj_t; + + +// Define these macros here since they must be updated if contents of +// obj_t changes. + +#define bli_obj_init_full_shallow_copy_of( a, b ) \ +{ \ + (b).root = (a).root; \ +\ + (b).off[0] = (a).off[0]; \ + (b).off[1] = (a).off[1]; \ + (b).dim[0] = (a).dim[0]; \ + (b).dim[1] = (a).dim[1]; \ + (b).diag_off = (a).diag_off; \ +\ + (b).info = (a).info; \ + (b).elem_size = (a).elem_size; \ +\ + (b).buffer = (a).buffer; \ + (b).rs = (a).rs; \ + (b).cs = (a).cs; \ + (b).is = (a).is; \ +\ + (b).scalar = (a).scalar; \ +\ + /*(b).pack_mem = (a).pack_mem;*/ \ + (b).m_padded = (a).m_padded; \ + (b).n_padded = (a).n_padded; \ + (b).ps = (a).ps; \ + (b).pd = (a).pd; \ + (b).m_panel = (a).m_panel; \ + (b).n_panel = (a).n_panel; \ +} + +#define bli_obj_init_subpart_from( a, b ) \ +{ \ + (b).root = (a).root; \ +\ + (b).off[0] = (a).off[0]; \ + (b).off[1] = (a).off[1]; \ + /* Avoid copying m since it will be overwritten.
*/ \ + /* Avoid copying n since it will be overwritten. */ \ + (b).diag_off = (a).diag_off; \ +\ + (b).info = (a).info; \ + (b).elem_size = (a).elem_size; \ +\ + (b).buffer = (a).buffer; \ + (b).rs = (a).rs; \ + (b).cs = (a).cs; \ + (b).is = (a).is; \ +\ + (b).scalar = (a).scalar; \ +\ + /* NOTE: obj_t has no pack_mem field (a pack_mem entry is kept in the + control tree node, cntl_t, instead), so no pack_mem is copied here. + This macro is used when creating subpartitions, including those of + packed objects, so the subpartition still inherits the remaining + pack-related fields of its parent, such as the padded dimensions. */ \ + /*(b).pack_mem = (a).pack_mem;*/ \ + (b).m_padded = (a).m_padded; \ + (b).n_padded = (a).n_padded; \ + (b).pd = (a).pd; \ + (b).ps = (a).ps; \ + (b).m_panel = (a).m_panel; \ + (b).n_panel = (a).n_panel; \ +} + + // -- Context type -- typedef struct cntx_s @@ -923,6 +944,7 @@ typedef struct cntx_s func_t packm_ukrs; + opid_t family; ind_t method; pack_t schema_a; pack_t schema_b; diff --git a/frame/include/blis.h b/frame/include/blis.h index 32fca0c71..0eaaf413f 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -106,6 +106,7 @@ extern "C" { #include "bli_ind.h" #include "bli_membrk.h" #include "bli_pool.h" +#include "bli_memsys.h" #include "bli_mem.h" #include "bli_part.h" #include "bli_prune.h" @@ -113,6 +114,7 @@ extern "C" { #include "bli_blksz.h" #include "bli_func.h" #include "bli_mbool.h" +#include "bli_auxinfo.h" #include "bli_param_map.h" #include "bli_clock.h" #include "bli_check.h" diff --git a/frame/include/level0/bli_gets.h b/frame/include/level0/bli_gets.h index 36e9af5c3..92d018159 100644 --- a/frame/include/level0/bli_gets.h +++ b/frame/include/level0/bli_gets.h @@ -46,27 +46,38 @@ #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_isgets( x, yr, yi ) { (yr) = (
float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } + +#define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } +#define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } +#define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } +#define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } +#define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) +#define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif diff --git a/frame/include/level0/bli_sets.h b/frame/include/level0/bli_sets.h index 551d03025..61bd7e426 100644 --- a/frame/include/level0/bli_sets.h +++ b/frame/include/level0/bli_sets.h @@ -45,11 +45,13 
@@ #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } +#define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } +#define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX @@ -57,11 +59,13 @@ #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } +#define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } +#define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX @@ -77,11 +81,18 @@ #endif // BLIS_ENABLE_C99_COMPLEX +#define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } +#define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } +#define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } +#define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } +#define bli_iisets( xr, xi, y ) { (y) = (xr); } + #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) +#define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif diff --git a/frame/include/old/bli_kernel_post_macro_defs.h b/frame/include/old/bli_kernel_post_macro_defs.h deleted file mode 100644 index 
4a261b033..000000000 --- a/frame/include/old/bli_kernel_post_macro_defs.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifndef BLIS_KERNEL_POST_MACRO_DEFS_H -#define BLIS_KERNEL_POST_MACRO_DEFS_H - - -// -- Maximum register blocksize search ---------------------------------------- - -// The macro-kernels oftentimes need to statically allocate a temporary -// MR x NR micro-tile of C. This micro-tile must be sized such that it will -// work for both native and induced implementations, since the user can switch -// between them at runtime. In order to facilitate the sizing of those -// micro-tiles, we must determine the largest the register blocksizes would -// need to be to accommodate both native and induced-based complex -// micro-kernels. For real datatypes, the maximum is never larger than the -// actual s and d register blocksizes. However, for complex datatypes, the -// "native" register blocksizes may differ from the "virtual" register -// blocksizes used by the induced implementations. Usually, it is the register -// blocksizes used for induced-based complex micro-kernels that would be -// larger, and thus determine the maximum for c and z datatypes. But, we -// prefer not to assume this, therefore, we always take the larger of the -// two values. - -#define BLIS_DEFAULT_IND_MR_C BLIS_DEFAULT_MR_S -#define BLIS_DEFAULT_IND_NR_C BLIS_DEFAULT_NR_S -#define BLIS_DEFAULT_IND_MR_Z BLIS_DEFAULT_MR_D -#define BLIS_DEFAULT_IND_NR_Z BLIS_DEFAULT_NR_D - -// -// Find the largest register blocksize MR. -// - -#define BLIS_MAX_DEFAULT_MR_S BLIS_DEFAULT_MR_S -#define BLIS_MAX_DEFAULT_MR_D BLIS_DEFAULT_MR_D - -// Choose between the native and induced blocksize for scomplex. -#define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_MR_C -#if BLIS_DEFAULT_IND_MR_C > BLIS_MAX_DEFAULT_MR_C -#undef BLIS_MAX_DEFAULT_MR_C -#define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_IND_MR_C -#endif - -// Choose between the native and induced blocksize for dcomplex. 
-#define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_MR_Z -#if BLIS_DEFAULT_IND_MR_Z > BLIS_MAX_DEFAULT_MR_Z -#undef BLIS_MAX_DEFAULT_MR_Z -#define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_IND_MR_Z -#endif - -// -// Find the largest register blocksize NR. -// - -#define BLIS_MAX_DEFAULT_NR_S BLIS_DEFAULT_NR_S -#define BLIS_MAX_DEFAULT_NR_D BLIS_DEFAULT_NR_D - -// Choose between the native and induced blocksize for scomplex. -#define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_NR_C -#if BLIS_DEFAULT_IND_NR_C > BLIS_MAX_DEFAULT_NR_C -#undef BLIS_MAX_DEFAULT_NR_C -#define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_IND_NR_C -#endif - -// Choose between the native and induced blocksize for dcomplex. -#define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_NR_Z -#if BLIS_DEFAULT_IND_NR_Z > BLIS_MAX_DEFAULT_NR_Z -#undef BLIS_MAX_DEFAULT_NR_Z -#define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_IND_NR_Z -#endif - - -// -- Abbreiviated macros ------------------------------------------------------ - -// Here, we shorten the maximum blocksizes found above so that they can be -// derived via the PASTEMAC macro. - -// Maximum MR blocksizes - -#define bli_smaxmr BLIS_MAX_DEFAULT_MR_S -#define bli_dmaxmr BLIS_MAX_DEFAULT_MR_D -#define bli_cmaxmr BLIS_MAX_DEFAULT_MR_C -#define bli_zmaxmr BLIS_MAX_DEFAULT_MR_Z - -// Maximum NR blocksizes - -#define bli_smaxnr BLIS_MAX_DEFAULT_NR_S -#define bli_dmaxnr BLIS_MAX_DEFAULT_NR_D -#define bli_cmaxnr BLIS_MAX_DEFAULT_NR_C -#define bli_zmaxnr BLIS_MAX_DEFAULT_NR_Z - - -#endif - diff --git a/frame/include/old/bli_kernel_prototypes.h b/frame/include/old/bli_kernel_prototypes.h deleted file mode 100644 index 333b2c578..000000000 --- a/frame/include/old/bli_kernel_prototypes.h +++ /dev/null @@ -1,529 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifndef BLIS_KERNEL_PROTOTYPES_H -#define BLIS_KERNEL_PROTOTYPES_H - - -// -- Define PASTEMAC-friendly kernel function name macros --------------------- - -// -// Level-3 -// - -// gemm micro-kernels - -#define bli_sGEMM_UKERNEL BLIS_SGEMM_UKERNEL -#define bli_dGEMM_UKERNEL BLIS_DGEMM_UKERNEL -#define bli_cGEMM_UKERNEL BLIS_CGEMM_UKERNEL -#define bli_zGEMM_UKERNEL BLIS_ZGEMM_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t k, \ - ctype* alpha, \ - ctype* a, \ - ctype* b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( GEMM_UKERNEL ) - -// gemmtrsm_l micro-kernels - -#define bli_sGEMMTRSM_L_UKERNEL BLIS_SGEMMTRSM_L_UKERNEL -#define bli_dGEMMTRSM_L_UKERNEL BLIS_DGEMMTRSM_L_UKERNEL -#define bli_cGEMMTRSM_L_UKERNEL BLIS_CGEMMTRSM_L_UKERNEL -#define bli_zGEMMTRSM_L_UKERNEL BLIS_ZGEMMTRSM_L_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t k, \ - ctype* alpha, \ - ctype* a10, \ - ctype* a11, \ - ctype* b01, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( GEMMTRSM_L_UKERNEL ) - -// gemmtrsm_u micro-kernels - -#define bli_sGEMMTRSM_U_UKERNEL BLIS_SGEMMTRSM_U_UKERNEL -#define bli_dGEMMTRSM_U_UKERNEL BLIS_DGEMMTRSM_U_UKERNEL -#define bli_cGEMMTRSM_U_UKERNEL BLIS_CGEMMTRSM_U_UKERNEL -#define bli_zGEMMTRSM_U_UKERNEL BLIS_ZGEMMTRSM_U_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t k, \ - ctype* alpha, \ - ctype* a12, \ - ctype* a11, \ - ctype* b21, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( GEMMTRSM_U_UKERNEL ) - -// trsm_l micro-kernels - -#define bli_sTRSM_L_UKERNEL BLIS_STRSM_L_UKERNEL -#define bli_dTRSM_L_UKERNEL BLIS_DTRSM_L_UKERNEL -#define bli_cTRSM_L_UKERNEL BLIS_CTRSM_L_UKERNEL 
-#define bli_zTRSM_L_UKERNEL BLIS_ZTRSM_L_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - ctype* a11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( TRSM_L_UKERNEL ) - -// trsm_u micro-kernels - -#define bli_sTRSM_U_UKERNEL BLIS_STRSM_U_UKERNEL -#define bli_dTRSM_U_UKERNEL BLIS_DTRSM_U_UKERNEL -#define bli_cTRSM_U_UKERNEL BLIS_CTRSM_U_UKERNEL -#define bli_zTRSM_U_UKERNEL BLIS_ZTRSM_U_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - ctype* a11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( TRSM_U_UKERNEL ) - - -// -// Level-1m -// - -// NOTE: We don't need any PASTEMAC-friendly aliases to packm kernel -// macros because they are used directly in the initialization of the -// function pointer array, rather than via a templatizing wrapper macro. - - -// -// Level-1f -// - -// axpy2v kernels - -#define bli_sssAXPY2V_KERNEL BLIS_SAXPY2V_KERNEL -#define bli_dddAXPY2V_KERNEL BLIS_DAXPY2V_KERNEL -#define bli_cccAXPY2V_KERNEL BLIS_CAXPY2V_KERNEL -#define bli_zzzAXPY2V_KERNEL BLIS_ZAXPY2V_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, kername ) \ -\ -void PASTEMAC3(chx,chy,chz,kername) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_xy* alpha1, \ - ctype_xy* alpha2, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_z* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( AXPY2V_KERNEL ) - -// dotaxpyv kernels - -#define bli_sssDOTAXPYV_KERNEL BLIS_SDOTAXPYV_KERNEL -#define bli_dddDOTAXPYV_KERNEL BLIS_DDOTAXPYV_KERNEL -#define bli_cccDOTAXPYV_KERNEL BLIS_CDOTAXPYV_KERNEL -#define bli_zzzDOTAXPYV_KERNEL BLIS_ZDOTAXPYV_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, kername ) \ -\ 
-void PASTEMAC3(chx,chy,chz,kername) \ - ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype_x* alpha, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_xy* rho, \ - ctype_z* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTAXPYV_KERNEL ) - -// axpyf kernels - -#define bli_sssAXPYF_KERNEL BLIS_SAXPYF_KERNEL -#define bli_dddAXPYF_KERNEL BLIS_DAXPYF_KERNEL -#define bli_cccAXPYF_KERNEL BLIS_CAXPYF_KERNEL -#define bli_zzzAXPYF_KERNEL BLIS_ZAXPYF_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,kername) \ - ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* alpha, \ - ctype_a* a, inc_t inca, inc_t lda, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3U12_BASIC( AXPYF_KERNEL ) - -// dotxf kernels - -#define bli_sssDOTXF_KERNEL BLIS_SDOTXF_KERNEL -#define bli_dddDOTXF_KERNEL BLIS_DDOTXF_KERNEL -#define bli_cccDOTXF_KERNEL BLIS_CDOTXF_KERNEL -#define bli_zzzDOTXF_KERNEL BLIS_ZDOTXF_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,kername) \ - ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* alpha, \ - ctype_a* a, inc_t inca, inc_t lda, \ - ctype_x* x, inc_t incx, \ - ctype_y* beta, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTXF_KERNEL ) - -// dotxaxpyf kernels - -#define bli_sssDOTXAXPYF_KERNEL BLIS_SDOTXAXPYF_KERNEL -#define bli_dddDOTXAXPYF_KERNEL BLIS_DDOTXAXPYF_KERNEL -#define bli_cccDOTXAXPYF_KERNEL BLIS_CDOTXAXPYF_KERNEL -#define bli_zzzDOTXAXPYF_KERNEL BLIS_ZDOTXAXPYF_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, kername ) \ -\ -void PASTEMAC3(cha,chb,chc,kername) \ - ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t 
conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ab* alpha, \ - ctype_a* a, inc_t inca, inc_t lda, \ - ctype_b* w, inc_t incw, \ - ctype_b* x, inc_t incx, \ - ctype_c* beta, \ - ctype_c* y, inc_t incy, \ - ctype_c* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTXAXPYF_KERNEL ) - - -// -// Level-1v -// - -// addv kernels - -#define bli_ssADDV_KERNEL BLIS_SADDV_KERNEL -#define bli_ddADDV_KERNEL BLIS_DADDV_KERNEL -#define bli_ccADDV_KERNEL BLIS_CADDV_KERNEL -#define bli_zzADDV_KERNEL BLIS_ZADDV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( ADDV_KERNEL ) - -// axpyv kernels - -#define bli_sssAXPYV_KERNEL BLIS_SAXPYV_KERNEL -#define bli_dddAXPYV_KERNEL BLIS_DAXPYV_KERNEL -#define bli_cccAXPYV_KERNEL BLIS_CAXPYV_KERNEL -#define bli_zzzAXPYV_KERNEL BLIS_ZAXPYV_KERNEL - -#undef GENTPROT3 -#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_a* alpha, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3_BASIC( AXPYV_KERNEL ) - -// copyv kernels - -#define bli_ssCOPYV_KERNEL BLIS_SCOPYV_KERNEL -#define bli_ddCOPYV_KERNEL BLIS_DCOPYV_KERNEL -#define bli_ccCOPYV_KERNEL BLIS_CCOPYV_KERNEL -#define bli_zzCOPYV_KERNEL BLIS_ZCOPYV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( COPYV_KERNEL ) - -// dotv kernels - -#define bli_sssDOTV_KERNEL BLIS_SDOTV_KERNEL -#define bli_dddDOTV_KERNEL BLIS_DDOTV_KERNEL -#define bli_cccDOTV_KERNEL BLIS_CDOTV_KERNEL -#define bli_zzzDOTV_KERNEL BLIS_ZDOTV_KERNEL - -#undef GENTPROT3 -#define GENTPROT3( ctype_x, 
ctype_y, ctype_r, chx, chy, chr, kername ) \ -\ -void PASTEMAC3(chx,chy,chr,kername) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_r* rho \ - ); - -INSERT_GENTPROT3_BASIC( DOTV_KERNEL ) - -// dotxv kernels - -#define bli_sssDOTXV_KERNEL BLIS_SDOTXV_KERNEL -#define bli_dddDOTXV_KERNEL BLIS_DDOTXV_KERNEL -#define bli_cccDOTXV_KERNEL BLIS_CDOTXV_KERNEL -#define bli_zzzDOTXV_KERNEL BLIS_ZDOTXV_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, kername ) \ -\ -void PASTEMAC3(chx,chy,chr,kername) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_xy* alpha, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_r* beta, \ - ctype_r* rho \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTXV_KERNEL ) - -// invertv kernels - -#define bli_sINVERTV_KERNEL BLIS_SINVERTV_KERNEL -#define bli_dINVERTV_KERNEL BLIS_DINVERTV_KERNEL -#define bli_cINVERTV_KERNEL BLIS_CINVERTV_KERNEL -#define bli_zINVERTV_KERNEL BLIS_ZINVERTV_KERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx \ - ); - -INSERT_GENTPROT_BASIC( INVERTV_KERNEL ) - -// scal2v kernels - -#define bli_sssSCAL2V_KERNEL BLIS_SSCAL2V_KERNEL -#define bli_dddSCAL2V_KERNEL BLIS_DSCAL2V_KERNEL -#define bli_cccSCAL2V_KERNEL BLIS_CSCAL2V_KERNEL -#define bli_zzzSCAL2V_KERNEL BLIS_ZSCAL2V_KERNEL - -#undef GENTPROT3 -#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, kername ) \ -\ -void PASTEMAC3(chb,chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_b* beta, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3_BASIC( SCAL2V_KERNEL ) - -// scalv kernels - -#define bli_ssSCALV_KERNEL BLIS_SSCALV_KERNEL -#define bli_ddSCALV_KERNEL BLIS_DSCALV_KERNEL -#define bli_ccSCALV_KERNEL BLIS_CSCALV_KERNEL -#define bli_zzSCALV_KERNEL BLIS_ZSCALV_KERNEL - -#undef 
GENTPROT2 -#define GENTPROT2( ctype_b, ctype_x, chb, chx, kername ) \ -\ -void PASTEMAC2(chb,chx,kername) \ - ( \ - conj_t conjbeta, \ - dim_t n, \ - ctype_b* beta, \ - ctype_x* x, inc_t incx \ - ); - -INSERT_GENTPROT2_BASIC( SCALV_KERNEL ) - -// setv kernels - -#define bli_ssSETV_KERNEL BLIS_SSETV_KERNEL -#define bli_ddSETV_KERNEL BLIS_DSETV_KERNEL -#define bli_ccSETV_KERNEL BLIS_CSETV_KERNEL -#define bli_zzSETV_KERNEL BLIS_ZSETV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_b, ctype_x, chb, chx, kername ) \ -\ -void PASTEMAC2(chb,chx,kername) \ - ( \ - dim_t n, \ - ctype_b* beta, \ - ctype_x* x, inc_t incx \ - ); - -INSERT_GENTPROT2_BASIC( SETV_KERNEL ) - -// subv kernels - -#define bli_ssSUBV_KERNEL BLIS_SSUBV_KERNEL -#define bli_ddSUBV_KERNEL BLIS_DSUBV_KERNEL -#define bli_ccSUBV_KERNEL BLIS_CSUBV_KERNEL -#define bli_zzSUBV_KERNEL BLIS_ZSUBV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( SUBV_KERNEL ) - -// swapv kernels - -#define bli_ssSWAPV_KERNEL BLIS_SSWAPV_KERNEL -#define bli_ddSWAPV_KERNEL BLIS_DSWAPV_KERNEL -#define bli_ccSWAPV_KERNEL BLIS_CSWAPV_KERNEL -#define bli_zzSWAPV_KERNEL BLIS_ZSWAPV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( SWAPV_KERNEL ) - - - -#endif - diff --git a/frame/include/old/bli_kernel_type_defs.h b/frame/include/old/bli_kernel_type_defs.h deleted file mode 100644 index e0190fe1b..000000000 --- a/frame/include/old/bli_kernel_type_defs.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_KERNEL_TYPE_DEFS_H -#define BLIS_KERNEL_TYPE_DEFS_H - - -// -// -- BLIS kernel types -------------------------------------------------------- -// - -// Here we generate typedef statements that generate custom types for -// kernel function pointers. Note that we use the function -// prototype-generating macro since it takes the same arguments we need -// to define our types. 
- -// -- Level-3 kernels -- - -/* -// gemm - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemm_ukr_t ) - - -// trsm_l/u - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( trsm_ukr_t ) - - -// gemmtrsm_l/u - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a1x, \ - ctype* restrict a11, \ - ctype* restrict bx1, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemmtrsm_ukr_t ) - -// -- packm kernels -- - -// packm_struc_cxk - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p \ - ); - -INSERT_GENTPROT_BASIC( packm_ker_t ) -*/ - - - -#endif - diff --git a/frame/ind/oapi/bli_l3_3m4m_oapi.c b/frame/ind/oapi/bli_l3_3m4m_oapi.c index 04f2259d2..40348e627 100644 --- a/frame/ind/oapi/bli_l3_3m4m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m_oapi.c @@ -34,12 +34,6 @@ #include "blis.h" -// Bring control trees into scope. 
-extern gemm_t* gemm_cntl; -extern trsm_t* trsm_l_cntl; -extern trsm_t* trsm_r_cntl; - - // -- gemm/her2k/syr2k --------------------------------------------------------- #undef GENFRONT @@ -81,10 +75,9 @@ void PASTEMAC(opname,imeth) \ stage. */ \ if ( i > 0 ) beta_use = &BLIS_ONE; \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, b, beta_use, c, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( alpha, a, b, beta_use, c, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -161,10 +154,9 @@ void PASTEMAC(opname,imeth) \ stage. */ \ if ( i > 0 ) beta_use = &BLIS_ONE; \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, beta_use, c, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( side, alpha, a, b, beta_use, c, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -239,10 +231,9 @@ void PASTEMAC(opname,imeth) \ stage. */ \ if ( i > 0 ) beta_use = &BLIS_ONE; \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, beta_use, c, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( alpha, a, beta_use, c, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -302,10 +293,9 @@ void PASTEMAC(opname,imeth) \ /* Prepare the context for the ith stage of computation. */ \ PASTEMAC2(cname,imeth,_cntx_stage)( i, cntx_p ); \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. 
*/ \ - PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -353,11 +343,9 @@ void PASTEMAC(opname,imeth) \ /* NOTE: trsm cannot be implemented via any induced method that needs to execute in stages (e.g. 3mh, 4mh). */ \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, \ - PASTECH(cname,_l_cntl), \ - PASTECH(cname,_r_cntl) ); \ + PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -373,10 +361,3 @@ GENFRONT( trsm, trsm, 3m1, 1 ) //GENFRONT( trmm, trsm, 4mb, 1 ) // Unimplementable. GENFRONT( trsm, trsm, 4m1, 1 ) - -// -// ----------------------------------------------------------------------------- -// ----------------------------------------------------------------------------- -// ----------------------------------------------------------------------------- -// - diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 9038067c5..68b664d65 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -34,11 +34,6 @@ #include "blis.h" -// Bring control trees into scope. -extern gemm_t* gemm_cntl; -extern trsm_t* trsm_l_cntl; -extern trsm_t* trsm_r_cntl; - // NOTE: The function definitions in this file can be consolidated with the // definitions for the other induced methods. The only advantage of keeping // them separate is that it allows us to avoid the very small loop overhead @@ -69,8 +64,7 @@ void PASTEMAC(opname,imeth) \ tree. 
*/ \ PASTEMAC(opname,_front) \ ( \ - alpha, a, b, beta, c, cntx_p, \ - PASTECH(cname,_cntl) \ + alpha, a, b, beta, c, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -107,8 +101,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - side, alpha, a, b, beta, c, cntx_p, \ - PASTECH(cname,_cntl) \ + side, alpha, a, b, beta, c, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -143,8 +136,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - alpha, a, beta, c, cntx_p, \ - PASTECH(cname,_cntl) \ + alpha, a, beta, c, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -178,8 +170,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - side, alpha, a, b, cntx_p, \ - PASTECH(cname,_cntl) \ + side, alpha, a, b, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -212,9 +203,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - side, alpha, a, b, cntx_p, \ - PASTECH(cname,_l_cntl), \ - PASTECH(cname,_r_cntl) \ + side, alpha, a, b, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. 
*/ \ diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 04f0c34a8..7c1fe69f9 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -201,21 +201,27 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) void bli_l3_thread_decorator ( - dim_t n_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread + dim_t n_threads, + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ) { _Pragma( "omp parallel num_threads(n_threads)" ) { - dim_t omp_id = omp_get_thread_num(); + dim_t omp_id = omp_get_thread_num(); + thrinfo_t* thread_i = thread[omp_id]; + + cntl_t* cntl_use; + + // Create a default control tree for the operation, if needed. + bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); func ( @@ -225,9 +231,12 @@ void bli_l3_thread_decorator beta, c, cntx, - cntl, + cntl_use, thread[omp_id] ); + + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); } } diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 42a9c6979..0f2707d91 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -80,7 +80,7 @@ void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) communicator->n_threads = n_threads; communicator->sense = 0; communicator->threads_arrived = 0; - + #ifdef BLIS_USE_PTHREAD_MUTEX pthread_mutex_init( &communicator->mutex, NULL ); #endif @@ -123,52 +123,71 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) #endif -void* thread_decorator_helper( void* data_void ); +void* bli_l3_thread_entry( void* data_void ); +// A data structure to assist in passing operands to additional threads. 
typedef struct thread_data { - l3_int_t func; - obj_t* alpha; - obj_t* a; - obj_t* b; - obj_t* beta; - obj_t* c; - void* cntx; - void* cntl; - void* thread; + l3int_t func; + obj_t* alpha; + obj_t* a; + obj_t* b; + obj_t* beta; + obj_t* c; + cntx_t* cntx; + cntl_t* cntl; + thrinfo_t* thread; } thread_data_t; -void* thread_decorator_helper( void* data_void ) +// Entry point for additional threads +void* bli_l3_thread_entry( void* data_void ) { - thread_data_t* data = data_void; + thread_data_t* data = data_void; + + obj_t* alpha = data->alpha; + obj_t* a = data->a; + obj_t* b = data->b; + obj_t* beta = data->beta; + obj_t* c = data->c; + cntx_t* cntx = data->cntx; + cntl_t* cntl = data->cntl; + thrinfo_t* thread_i = data->thread; + + cntl_t* cntl_use; + + // Create a default control tree for the operation, if needed. + bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); data->func ( - data->alpha, - data->a, - data->b, - data->beta, - data->c, - data->cntx, - data->cntl, - data->thread + alpha, + a, + b, + beta, + c, + cntx, + cntl_use, + thread_i ); + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); + return NULL; } void bli_l3_thread_decorator ( - dim_t n_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread + dim_t n_threads, + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ) { pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads ); @@ -176,22 +195,38 @@ void bli_l3_thread_decorator for ( int i = 1; i < n_threads; i++ ) { - //Setup the thread data - datas[i].func = func; - datas[i].alpha = alpha; - datas[i].a = a; - datas[i].b = b; - datas[i].beta = beta; - datas[i].c = c; - datas[i].cntx = cntx; - datas[i].cntl = cntl; + // Set up thread data for additional threads (beyond thread 0).
+ datas[i].func = func; + datas[i].alpha = alpha; + datas[i].a = a; + datas[i].b = b; + datas[i].beta = beta; + datas[i].c = c; + datas[i].cntx = cntx; + datas[i].cntl = cntl; datas[i].thread = thread[i]; - pthread_create( &pthreads[i], NULL, &thread_decorator_helper, &datas[i] ); + // Spawn additional threads. + pthread_create( &pthreads[i], NULL, &bli_l3_thread_entry, &datas[i] ); } - func( alpha, a, b, beta, c, cntx, cntl, thread[0] ); + // The main thread executes this. + { + cntl_t* cntl_use; + + // Create a default control tree for the operation, if needed. + bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + + // Thread 0 simply executes func. + func( alpha, a, b, beta, c, cntx, cntl_use, thread[0] ); + + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread[0] ); + } + + + // Thread 0 waits for additional threads to finish. for ( int i = 1; i < n_threads; i++) { pthread_join( pthreads[i], NULL ); diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index fb2bc97bb..99de67220 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -36,24 +36,6 @@ #ifndef BLIS_ENABLE_MULTITHREADING -void bli_l3_thread_decorator - ( - dim_t n_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread - ) -{ - func( alpha, a, b, beta, c, cntx, cntl, thread[0] ); -} - - //Constructors and destructors for constructors thrcomm_t* bli_thrcomm_create( dim_t n_threads ) { @@ -89,5 +71,43 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) return; } +void bli_l3_thread_decorator + ( + dim_t n_threads, + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread + ) +{ + thrinfo_t* thread_i = thread[0]; + + cntl_t* cntl_use; + + // Create a default control tree for the operation, if needed.
+ bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + cntl_use, + thread[0] + ); + + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); +} + + #endif diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index a4f69aeba..43f0eaf8b 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -38,7 +38,6 @@ static bool_t bli_thread_is_init = FALSE; thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; -thrinfo_t BLIS_HERK_SINGLE_THREADED = {}; thrcomm_t BLIS_SINGLE_COMM = {}; // ----------------------------------------------------------------------------- @@ -51,7 +50,6 @@ void bli_thread_init( void ) bli_thrcomm_init( &BLIS_SINGLE_COMM, 1 ); bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); - bli_l3_thrinfo_init_single( &BLIS_HERK_SINGLE_THREADED ); // Mark API as initialized. 
bli_thread_is_init = TRUE; @@ -70,7 +68,7 @@ bool_t bli_thread_is_initialized( void ) // ----------------------------------------------------------------------------- -void bli_thread_get_range +void bli_thread_get_range_sub ( thrinfo_t* thread, dim_t n, @@ -224,8 +222,8 @@ siz_t bli_thread_get_range_l2r dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, n, bf, - FALSE, start, end ); + bli_thread_get_range_sub( thr, n, bf, + FALSE, start, end ); return m * ( *end - *start ); } @@ -243,8 +241,8 @@ siz_t bli_thread_get_range_r2l dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, n, bf, - TRUE, start, end ); + bli_thread_get_range_sub( thr, n, bf, + TRUE, start, end ); return m * ( *end - *start ); } @@ -262,8 +260,8 @@ siz_t bli_thread_get_range_t2b dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, m, bf, - FALSE, start, end ); + bli_thread_get_range_sub( thr, m, bf, + FALSE, start, end ); return n * ( *end - *start ); } @@ -281,12 +279,14 @@ siz_t bli_thread_get_range_b2t dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, m, bf, - TRUE, start, end ); + bli_thread_get_range_sub( thr, m, bf, + TRUE, start, end ); return n * ( *end - *start ); } +// ----------------------------------------------------------------------------- + dim_t bli_thread_get_range_width_l ( doff_t diagoff_j, @@ -496,7 +496,9 @@ siz_t bli_find_area_trap_l return ( siz_t )area; } -siz_t bli_thread_get_range_weighted +// ----------------------------------------------------------------------------- + +siz_t bli_thread_get_range_weighted_sub ( thrinfo_t* thread, doff_t diagoff, @@ -570,11 +572,15 @@ siz_t bli_thread_get_range_weighted { // Compute the width of the jth subpartition, taking the // current diagonal offset 
into account, if needed. - width_j = bli_thread_get_range_width_l( diagoff_j, m, n_left, - j, n_way, - bf, bf_left, - area_per_thr, - handle_edge_low ); + width_j = + bli_thread_get_range_width_l + ( + diagoff_j, m, n_left, + j, n_way, + bf, bf_left, + area_per_thr, + handle_edge_low + ); // If the current thread belongs to caucus j, this is his // subpartition. So we compute the implied index range and @@ -611,9 +617,12 @@ siz_t bli_thread_get_range_weighted bli_toggle_bool( handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. - area = bli_thread_get_range_weighted( thread, diagoff, uplo, m, n, bf, - handle_edge_low, - j_start_thr, j_end_thr ); + area = bli_thread_get_range_weighted_sub + ( + thread, diagoff, uplo, m, n, bf, + handle_edge_low, + j_start_thr, j_end_thr + ); // Reverse the indexing basis for the subpartition ranges so that // the indices, relative to left-to-right iteration through the @@ -626,6 +635,124 @@ siz_t bli_thread_get_range_weighted return area; } +siz_t bli_thread_get_range_mdim + ( + dir_t direct, + thrinfo_t* thr, + obj_t* a, + obj_t* b, + obj_t* c, + cntl_t* cntl, + cntx_t* cntx, + dim_t* start, + dim_t* end + ) +{ + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntx_get_family( cntx ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. + if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( *a ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + obj_t* x; + bool_t use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. 
+ // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). + if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } + else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_weighted_t2b( thr, x, bmult, start, end ); + else + return bli_thread_get_range_weighted_b2t( thr, x, bmult, start, end ); + } + else + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_t2b( thr, x, bmult, start, end ); + else + return bli_thread_get_range_b2t( thr, x, bmult, start, end ); + } +} + +siz_t bli_thread_get_range_ndim + ( + dir_t direct, + thrinfo_t* thr, + obj_t* a, + obj_t* b, + obj_t* c, + cntl_t* cntl, + cntx_t* cntx, + dim_t* start, + dim_t* end + ) +{ + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntx_get_family( cntx ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. + if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( *b ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + obj_t* x; + bool_t use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. 
+ // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). + if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } + else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end ); + else + return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end ); + } + else + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_l2r( thr, x, bmult, start, end ); + else + return bli_thread_get_range_r2l( thr, x, bmult, start, end ); + } +} + siz_t bli_thread_get_range_weighted_l2r ( thrinfo_t* thr, @@ -656,13 +783,20 @@ siz_t bli_thread_get_range_weighted_l2r bli_reflect_about_diag( diagoff, uplo, m, n ); } - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - FALSE, start, end ); + area = + bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + FALSE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_l2r( thr, a, bmult, - start, end ); + area = bli_thread_get_range_l2r + ( + thr, a, bmult, + start, end + ); } return area; @@ -700,13 +834,20 @@ siz_t bli_thread_get_range_weighted_r2l bli_rotate180_trapezoid( diagoff, uplo ); - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - TRUE, start, end ); + area = + bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + TRUE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_r2l( thr, a, bmult, - start, end ); + area = bli_thread_get_range_r2l + ( + thr, a, bmult, + start, end + ); } 
return area; @@ -744,13 +885,20 @@ siz_t bli_thread_get_range_weighted_t2b bli_reflect_about_diag( diagoff, uplo, m, n ); - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - FALSE, start, end ); + area = + bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + FALSE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_t2b( thr, a, bmult, - start, end ); + area = bli_thread_get_range_t2b + ( + thr, a, bmult, + start, end + ); } return area; @@ -790,18 +938,25 @@ siz_t bli_thread_get_range_weighted_b2t bli_rotate180_trapezoid( diagoff, uplo ); - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - TRUE, start, end ); + area = bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + TRUE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_b2t( thr, a, bmult, - start, end ); + area = bli_thread_get_range_b2t + ( + thr, a, bmult, + start, end + ); } return area; } +// ----------------------------------------------------------------------------- // Some utilities dim_t bli_env_read_nway( char* env ) diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 2498baf8c..10097c39e 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -72,7 +72,7 @@ void bli_thread_finalize( void ); bool_t bli_thread_is_initialized( void ); // Thread range-related prototypes. 
-void bli_thread_get_range +void bli_thread_get_range_sub ( thrinfo_t* thread, dim_t n, @@ -82,6 +82,25 @@ void bli_thread_get_range dim_t* end ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +siz_t PASTEMAC0( opname ) \ + ( \ + dir_t direct, \ + thrinfo_t* thr, \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntl_t* cntl, \ + cntx_t* cntx, \ + dim_t* start, \ + dim_t* end \ + ); + +GENPROT( thread_get_range_mdim ) +GENPROT( thread_get_range_ndim ) + #undef GENPROT #define GENPROT( opname ) \ \ @@ -123,7 +142,7 @@ siz_t bli_find_area_trap_l dim_t n, doff_t diagoff ); -siz_t bli_thread_get_range_weighted +siz_t bli_thread_get_range_weighted_sub ( thrinfo_t* thread, doff_t diagoff, @@ -139,31 +158,31 @@ siz_t bli_thread_get_range_weighted // Level-3 internal function type -typedef void (*l3_int_t) +typedef void (*l3int_t) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void* thread + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread ); // Level-3 thread decorator prototype void bli_l3_thread_decorator ( - dim_t num_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread + dim_t n_threads, + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ); // Miscellaneous prototypes diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index e47006954..4cf55b3d4 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -42,9 +42,8 @@ thrinfo_t* bli_thrinfo_create dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comms, + thrinfo_t* sub_node ) { thrinfo_t* thread = bli_malloc_intl( sizeof( thrinfo_t ) ); @@ -55,9 +54,8 @@ thrinfo_t* bli_thrinfo_create ocomm, ocomm_id, icomm, icomm_id, n_way, 
work_id, - opackm, - ipackm, - sub_self + free_comms, + sub_node ); return thread; @@ -72,21 +70,19 @@ void bli_thrinfo_init dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comms, + thrinfo_t* sub_node ) { - thread->ocomm = ocomm; - thread->ocomm_id = ocomm_id; - thread->icomm = icomm; - thread->icomm_id = icomm_id; - thread->n_way = n_way; - thread->work_id = work_id; + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->free_comms = free_comms; - thread->opackm = opackm; - thread->ipackm = ipackm; - thread->sub_self = sub_self; + thread->sub_node = sub_node; } void bli_thrinfo_init_single @@ -101,37 +97,8 @@ void bli_thrinfo_init_single &BLIS_SINGLE_COMM, 0, 1, 0, - &BLIS_PACKM_SINGLE_THREADED, - &BLIS_PACKM_SINGLE_THREADED, + FALSE, thread ); } -#if 0 -void bli_thrinfo_free - ( - thrinfo_t* thread - ) -{ - if ( thread == NULL || - thread == &BLIS_GEMM_SINGLE_THREADED || - thread == &BLIS_HERK_SINGLE_THREADED || - thread == &BLIS_PACKM_SINGLE_THREADED - ) return; - - // Free Communicators - if ( bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( thread->ocomm ); - if ( bli_thrinfo_sub_self( thread ) == NULL && bli_thread_am_ichief( thread ) ) - bli_thrcomm_free( thread->icomm ); - - // Free thrinfo chidren - bli_packm_thrinfo_free( thread->opackm ); - bli_packm_thrinfo_free( thread->ipackm ); - bli_l3_thrinfo_free( thread->sub_self ); - bli_free_intl( thread ); - - return; -} -#endif - diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 3f8a3112b..9c0b28575 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -58,32 +58,43 @@ struct thrinfo_s // What we're working on. 
dim_t work_id; - struct thrinfo_s* opackm; - struct thrinfo_s* ipackm; - struct thrinfo_s* sub_self; + // When freeing, should the communicators in this node be freed? Usually, + // this field is true, but when nodes are created that share the same + // communicators as other nodes (such as with packm nodes), this is set + // to false. + bool_t free_comms; + + struct thrinfo_s* sub_node; }; typedef struct thrinfo_s thrinfo_t; - -#define bli_thread_num_threads( t ) ( t->ocomm->n_threads ) - -#define bli_thread_n_way( t ) ( t->n_way ) -#define bli_thread_work_id( t ) ( t->work_id ) -#define bli_thread_am_ochief( t ) ( t->ocomm_id == 0 ) -#define bli_thread_am_ichief( t ) ( t->icomm_id == 0 ) - -#define bli_thread_obroadcast( t, ptr ) bli_thrcomm_bcast( t->ocomm, t->ocomm_id, ptr ) -#define bli_thread_ibroadcast( t, ptr ) bli_thrcomm_bcast( t->icomm, t->icomm_id, ptr ) -#define bli_thread_obarrier( t ) bli_thrcomm_barrier( t->ocomm, t->ocomm_id ) -#define bli_thread_ibarrier( t ) bli_thrcomm_barrier( t->icomm, t->icomm_id ) - // -// Generic accessor macros for all thrinfo_t objects. +// thrinfo_t macros +// NOTE: The naming of these should be made consistent at some point.
// -#define bli_thrinfo_sub_opackm( t ) ( t->opackm ) -#define bli_thrinfo_sub_ipackm( t ) ( t->ipackm ) -#define bli_thrinfo_sub_self( t ) ( t->sub_self ) +#define bli_thread_num_threads( t ) ( (t)->ocomm->n_threads ) + +#define bli_thread_n_way( t ) ( (t)->n_way ) +#define bli_thread_work_id( t ) ( (t)->work_id ) + +#define bli_thread_am_ochief( t ) ( (t)->ocomm_id == 0 ) +#define bli_thread_am_ichief( t ) ( (t)->icomm_id == 0 ) + +#define bli_thread_obroadcast( t, p ) bli_thrcomm_bcast( (t)->ocomm, \ + (t)->ocomm_id, p ) +#define bli_thread_ibroadcast( t, p ) bli_thrcomm_bcast( (t)->icomm, \ + (t)->icomm_id, p ) +#define bli_thread_obarrier( t ) bli_thrcomm_barrier( (t)->ocomm, \ + (t)->ocomm_id ) +#define bli_thread_ibarrier( t ) bli_thrcomm_barrier( (t)->icomm, \ + (t)->icomm_id ) + +#define bli_thrinfo_ocomm( t ) ( (t)->ocomm ) +#define bli_thrinfo_icomm( t ) ( (t)->icomm ) +#define bli_thrinfo_needs_free_comms( t ) ( (t)->free_comms ) + +#define bli_thrinfo_sub_node( t ) ( (t)->sub_node ) // // Prototypes for level-3 thrinfo functions not specific to any operation. @@ -97,9 +108,8 @@ thrinfo_t* bli_thrinfo_create dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comms, + thrinfo_t* sub_node ); void bli_thrinfo_init @@ -111,9 +121,8 @@ void bli_thrinfo_init dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comms, + thrinfo_t* sub_node ); void bli_thrinfo_init_single diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c index 760e869b8..7a471995d 100644 --- a/frame/util/bli_util_check.c +++ b/frame/util/bli_util_check.c @@ -38,21 +38,6 @@ // Define object-based check functions. 
// -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* index \ - ) \ -{ \ - bli_utilv_xi_check( x, index ); \ -} - -GENFRONT( amaxv ) - - #undef GENFRONT #define GENFRONT( opname ) \ \ @@ -172,42 +157,6 @@ GENFRONT( sumsqv ) // ----------------------------------------------------------------------------- -void bli_utilv_xi_check - ( - obj_t* x, - obj_t* index - ) -{ - err_t e_val; - - // Check object datatypes. - - e_val = bli_check_floating_object( x ); - bli_check_error_code( e_val ); - - e_val = bli_check_integer_object( index ); - bli_check_error_code( e_val ); - - e_val = bli_check_nonconstant_object( index ); - bli_check_error_code( e_val ); - - // Check object dimensions. - - e_val = bli_check_vector_object( x ); - bli_check_error_code( e_val ); - - e_val = bli_check_scalar_object( index ); - bli_check_error_code( e_val ); - - // Check object buffers (for non-NULLness). - - e_val = bli_check_object_buffer( x ); - bli_check_error_code( e_val ); - - e_val = bli_check_object_buffer( index ); - bli_check_error_code( e_val ); -} - void bli_utilv_xa_check ( obj_t* x, diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h index 0fb23bccd..364ab5923 100644 --- a/frame/util/bli_util_check.h +++ b/frame/util/bli_util_check.h @@ -37,18 +37,6 @@ // Prototype object-based check functions. // -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* index \ - ); - -GENPROT( amaxv ) - - #undef GENPROT #define GENPROT( opname ) \ \ diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c index abac92b26..2942616c1 100644 --- a/frame/util/bli_util_oapi.c +++ b/frame/util/bli_util_oapi.c @@ -40,44 +40,6 @@ // Define object-based interfaces. 
// -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* index \ - BLIS_OAPI_CNTX_PARAM \ - ) \ -{ \ - BLIS_OAPI_CNTX_DECL \ -\ - num_t dt = bli_obj_datatype( *x ); \ -\ - dim_t n = bli_obj_vector_dim( *x ); \ - void* buf_x = bli_obj_buffer_at_off( *x ); \ - inc_t incx = bli_obj_vector_inc( *x ); \ -\ - void* buf_index = bli_obj_buffer_at_off( *index ); \ -\ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, index ); \ -\ - /* Invoke the typed function. */ \ - bli_call_ft_5 \ - ( \ - dt, \ - opname, \ - n, \ - buf_x, incx, \ - buf_index, \ - cntx \ - ); \ -} - -GENFRONT( amaxv ) - - #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h index 9de0afadb..f669271fa 100644 --- a/frame/util/bli_util_oapi.h +++ b/frame/util/bli_util_oapi.h @@ -37,19 +37,6 @@ // Prototype object-based interfaces. // -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* index \ - BLIS_OAPI_CNTX_PARAM \ - ); - -GENPROT( amaxv ) - - #undef GENPROT #define GENPROT( opname ) \ \ diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index 8fa89d9ae..ad2bb0b40 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -38,50 +38,6 @@ // Define BLAS-like interfaces with typed operands. // -#undef GENTFUNCI -#define GENTFUNCI( ctype, ctype_i, ch, chi, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* index, \ - cntx_t* cntx \ - ) \ -{ \ - cntx_t* cntx_p = cntx; \ -\ - /* If the vector length is zero, set the index to zero and return - early. This directly emulatess the behavior of netlib LAPACK's - i?amax() routines. */ \ - if ( bli_zero_dim1( n ) ) \ - { \ - ctype_i* zero_i = PASTEMAC(chi,0); \ -\ - PASTEMAC(chi,copys)( *zero_i, *index ); \ - return; \ - } \ -\ - /* Initialize a local context if the given context is NULL. 
*/ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ -\ - /* Invoke the helper variant, which loops over the appropriate kernel - to implement the current operation. */ \ - PASTEMAC2(ch,opname,_unb_var1) \ - ( \ - n, \ - x, incx, \ - index, \ - cntx_p \ - ); \ -\ - /* Finalize the context if it was initialized locally. */ \ - /*bli_cntx_finalize_local_if( opname, cntx );*/ \ -} - -INSERT_GENTFUNCI_BASIC0( amaxv ) - - #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h index e7dbc73e7..1f3d48a7c 100644 --- a/frame/util/bli_util_tapi.h +++ b/frame/util/bli_util_tapi.h @@ -37,20 +37,6 @@ // Prototype BLAS-like interfaces with typed operands. // -#undef GENTPROTI -#define GENTPROTI( ctype, ctype_i, ch, chi, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* index, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTI_BASIC( amaxv ) - - #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 165e99050..0790f28e1 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -39,71 +39,6 @@ // Define BLAS-like interfaces with typed operands. // -#undef GENTFUNCRI -#define GENTFUNCRI( ctype, ctype_r, ctype_i, ch, chr, chi, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* abmax_i, \ - cntx_t* cntx \ - ) \ -{ \ - ctype_r* minus_one = PASTEMAC(chr,m1); \ - ctype_i* zero_i = PASTEMAC(chi,0); \ -\ - ctype* chi1; \ - ctype_r chi1_r; \ - ctype_r chi1_i; \ - ctype_r abs_chi1; \ - ctype_r abs_chi1_max; \ - ctype_i i_max; \ - dim_t i; \ -\ - /* Initialize the index of the maximum absolute value to zero. 
*/ \ - PASTEMAC(chi,copys)( *zero_i, i_max ); \ -\ - /* Initialize the maximum absolute value search candidate with - -1, which is guaranteed to be less than all values we will - compute. */ \ - PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ -\ - for ( i = 0; i < n; ++i ) \ - { \ - chi1 = x + (i )*incx; \ -\ - /* Get the real and imaginary components of chi1. */ \ - PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ -\ - /* Replace chi1_r and chi1_i with their absolute values. */ \ - PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ - PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ -\ - /* Add the real and imaginary absolute values together. */ \ - PASTEMAC(chr,set0s)( abs_chi1 ); \ - PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ - PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ -\ - /* If the absolute value of the current element exceeds that of - the previous largest, save it and its index. If NaN is - encountered, then treat it the same as if it were a valid - value that was smaller than any previously seen. This - behavior mimics that of LAPACK's ?lange(). */ \ - if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ - { \ - PASTEMAC(chr,copys)( abs_chi1, abs_chi1_max ); \ - PASTEMAC(chi,copys)( i, i_max ); \ - } \ - } \ -\ - /* Store final index to output variable. */ \ - PASTEMAC(chi,copys)( i_max, *abmax_i ); \ -} - -INSERT_GENTFUNCRI_BASIC0( amaxv_unb_var1 ) - - #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ \ diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h index 369f5f650..09ca31d76 100644 --- a/frame/util/bli_util_unb_var1.h +++ b/frame/util/bli_util_unb_var1.h @@ -37,20 +37,6 @@ // Prototype BLAS-like interfaces with typed operands. 
// -#undef GENTPROTRI -#define GENTPROTRI( ctype, ctype_r, ctype_i, ch, chr, chi, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* abmax_i, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTRI_BASIC( amaxv_unb_var1 ) - - #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ diff --git a/testsuite/input.general b/testsuite/input.general index b9940dac3..0bf9053bd 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -36,6 +36,7 @@ sdcz # Datatype(s) to test: 1 # 4mh ('1' = enable; '0' = disable) 1 # 4m1b ('1' = enable; '0' = disable) 1 # 4m1a ('1' = enable; '0' = disable) +1 # native ('1' = enable; '0' = disable) 1 # Error-checking level: # '0' = disable error checking; '1' = full error checking i # Reaction to test failure: diff --git a/testsuite/input.operations b/testsuite/input.operations index 058721632..ac9298f8b 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -107,6 +107,10 @@ -1 # dimensions: m ? # parameters: conjx +1 # amaxv +1 # test sequential front-end +-1 # dimensions: m + 1 # axpbyv 1 # test sequential front-end -1 # dimensions: m diff --git a/testsuite/src/test_amaxv.c b/testsuite/src/test_amaxv.c new file mode 100644 index 000000000..9323ecbba --- /dev/null +++ b/testsuite/src/test_amaxv.c @@ -0,0 +1,400 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "amaxv"; +static char* o_types = "v"; // x +static char* p_types = ""; // (no parameters) +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. 
+void libblis_test_amaxv_deps + ( + test_params_t* params, + test_op_t* op + ); + +void libblis_test_amaxv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + num_t datatype, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_amaxv_impl + ( + iface_t iface, + obj_t* x, + obj_t* index + ); + +void libblis_test_amaxv_check + ( + test_params_t* params, + obj_t* x, + obj_t* index, + double* resid + ); + +void bli_amaxv_test + ( + obj_t* x, + obj_t* index + ); + + + +void libblis_test_amaxv_deps + ( + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( params, &(op->ops->randv) ); +} + + + +void libblis_test_amaxv + ( + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( op->test_done == TRUE ) return; + + // Return early if operation is disabled. + if ( op->op_switch == DISABLE_ALL || + op->ops->l1v_over == DISABLE_ALL ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_amaxv_deps( params, op ); + + // Execute the test driver for each implementation requested. + if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_amaxv_experiment ); + } +} + + + +void libblis_test_amaxv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + num_t datatype, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + dim_t m; + + obj_t x; + obj_t index; + + + // Map the dimension specifier to an actual dimension. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + + // Map parameter characters to BLIS constants. + + + // Create test scalars. 
+ bli_obj_scalar_init_detached( BLIS_INT, &index ); + + // Create test operands (vectors and/or matrices). + libblis_test_vobj_create( params, datatype, sc_str[0], m, &x ); + + // Randomize x. + libblis_test_vobj_randomize( params, FALSE, &x ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + time = bli_clock(); + + libblis_test_amaxv_impl( iface, &x, &index ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( x ) ) *perf *= 2.0; + + // Perform checks. + libblis_test_amaxv_check( params, &x, &index, resid ); + + // Zero out performance and residual if input vector is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + + // Free the test objects. + bli_obj_free( &x ); +} + + + +void libblis_test_amaxv_impl + ( + iface_t iface, + obj_t* x, + obj_t* index + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_amaxv( x, index ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_amaxv_check + ( + test_params_t* params, + obj_t* x, + obj_t* index, + double* resid + ) +{ + obj_t index_test; + obj_t chi_i; + obj_t chi_i_test; + dim_t i; + dim_t i_test; + + double i_d, junk; + double i_d_test; + + // + // Pre-conditions: + // - x is randomized. + // + // Under these conditions, we assume that the implementation for + // + // index := amaxv( x ) + // + // is functioning correctly if + // + // x[ index ] = max( x ) + // + // where max() is implemented via the bli_?amaxv_test() function. + // + + // The following two calls have already been made by the caller. That + // is, the index object has already been created and the library's + // amaxv implementation has already been tested. 
+ //bli_obj_scalar_init_detached( BLIS_INT, &index ); + //bli_amaxv( x, &index ); + bli_getsc( index, &i_d, &junk ); i = i_d; + bli_acquire_vi( i, x, &chi_i ); + + bli_obj_scalar_init_detached( BLIS_INT, &index_test ); + bli_amaxv_test( x, &index_test ); + bli_getsc( &index_test, &i_d_test, &junk ); i_test = i_d_test; + bli_acquire_vi( i_test, x, &chi_i_test ); + + // Verify that the values referenced by index and index_test are equal. + if ( bli_obj_equals( &chi_i, &chi_i_test ) ) *resid = 0.0; + else *resid = 1.0; +} + +// ----------------------------------------------------------------------------- + +// +// Prototype BLAS-like interfaces with typed operands for a local amaxv test +// operation +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx, \ + dim_t* restrict index, \ + cntx_t* cntx \ + ); \ + +INSERT_GENTPROT_BASIC( amaxv_test ) + +// +// Define object-based interface for a local amaxv test operation. +// + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* x, \ + obj_t* index \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *x ); \ +\ + dim_t n = bli_obj_vector_dim( *x ); \ + void* buf_x = bli_obj_buffer_at_off( *x ); \ + inc_t incx = bli_obj_vector_inc( *x ); \ +\ + void* buf_index = bli_obj_buffer_at_off( *index ); \ +\ + if ( bli_error_checking_is_enabled() ) \ + bli_amaxv_check( x, index ); \ +\ + /* Invoke the bli_?amaxv_test() function. */ \ + bli_call_ft_5 \ + ( \ + dt, \ + amaxv_test, \ + n, \ + buf_x, incx, \ + buf_index, \ + NULL \ + ); \ +} + +GENFRONT( amaxv_test ) + +// +// Define BLAS-like interfaces with typed operands for a local amaxv test +// operation. +// NOTE: This is based on a simplified version of the bli_?amaxv_ref() +// reference kernel. 
+// + +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t n, \ + ctype* x, inc_t incx, \ + dim_t* i_max, \ + cntx_t* cntx \ + ) \ +{ \ + ctype_r* minus_one = PASTEMAC(chr,m1); \ + dim_t* zero_i = PASTEMAC(i,0); \ +\ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r abs_chi1; \ + ctype_r abs_chi1_max; \ + dim_t i; \ +\ + /* Initialize the index of the maximum absolute value to zero. Note that zero_i is a pointer, so it must be dereferenced. */ \ + PASTEMAC(i,copys)( *zero_i, *i_max ); \ +\ + /* If the vector length is zero, return early. This directly emulates + the behavior of netlib BLAS's i?amax() routines. */ \ + if ( bli_zero_dim1( n ) ) return; \ +\ + /* Initialize the maximum absolute value search candidate with + -1, which is guaranteed to be less than all values we will + compute. */ \ + PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ +\ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + ctype* chi1 = x + (i )*incx; \ +\ + /* Get the real and imaginary components of chi1. */ \ + PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ +\ + /* Replace chi1_r and chi1_i with their absolute values. */ \ + PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ + PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ +\ + /* Add the real and imaginary absolute values together. */ \ + PASTEMAC(chr,set0s)( abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ +\ + /* If the absolute value of the current element exceeds that of + the previous largest, save it and its index. If NaN is + encountered, then treat it the same as if it were a valid + value that was larger than any previously seen. This + behavior mimics that of LAPACK's ?lange().
*/ \ + if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ + { \ + abs_chi1_max = abs_chi1; \ + *i_max = i; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCR_BASIC0( amaxv_test ) + diff --git a/testsuite/src/test_amaxv.h b/testsuite/src/test_amaxv.h new file mode 100644 index 000000000..364b27963 --- /dev/null +++ b/testsuite/src/test_amaxv.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +void libblis_test_amaxv + ( + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 0bb3c4440..514fdf66a 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -220,30 +220,28 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - // Initialize pack objects. - bli_obj_init_pack( &ap ); - bli_obj_init_pack( &bp ); - - // Create pack objects for a and b. - libblis_test_pobj_create( BLIS_MR, - BLIS_KR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - &a, &ap, - &cntx ); - libblis_test_pobj_create( BLIS_KR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx ); - - // Pack the contents of a and b to ap and bp, respectively. - bli_packm_blk_var1( &a, &ap, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - + // Create pack objects for a and b, and pack them to ap and bp, + // respectively. + cntl_t* cntl_a = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_KR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + &a, &ap, + &cntx + ); + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_KR, + BLIS_NR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + &cntx + ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) @@ -269,9 +267,10 @@ void libblis_test_gemm_ukr_experiment // Zero out performance and residual if output matrix is empty. libblis_test_check_empty_problem( &c, perf, resid ); - // Release packing buffers within pack objects. - bli_obj_release_pack( &ap ); - bli_obj_release_pack( &bp ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. 
+ bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index c74d47d60..afd436d7f 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -260,39 +260,34 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); - - // Initialize pack objects. - bli_obj_init_pack( &ap ); - bli_obj_init_pack( &bp ); - - // Create pack objects for a and b. - libblis_test_pobj_create( BLIS_MR, - BLIS_MR, - BLIS_INVERT_DIAG, - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - &a, &ap, - &cntx ); - libblis_test_pobj_create( BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx ); - - // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - - // Pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); + // Create pack objects for a and b, and pack them to ap and bp, + // respectively. + cntl_t* cntl_a = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_MR, + BLIS_INVERT_DIAG, + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + &a, &ap, + &cntx + ); + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_NR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + &cntx + ); // Set the uplo field of ap since the default for packed objects is // BLIS_DENSE, and the _make_subparts() routine needs this information // to know how to initialize the subpartitions. bli_obj_set_uplo( uploa, ap ); - // Create subpartitions from the a and b panels. 
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, &a1xp, &a11p, &bx1p, &b11p ); @@ -302,14 +297,13 @@ void libblis_test_gemmtrsm_ukr_experiment // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, a11p ); - // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c11_save, &c11 ); // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); time = bli_clock(); @@ -331,9 +325,10 @@ void libblis_test_gemmtrsm_ukr_experiment // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c11, perf, resid ); - // Release packing buffers within pack objects. - bli_obj_release_pack( &ap ); - bli_obj_release_pack( &bp ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. 
bli_obj_free( &a_big ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index b86772361..bd14d13b4 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -113,6 +113,7 @@ void libblis_test_utility_ops( test_params_t* params, test_ops_t* ops ) void libblis_test_level1v_ops( test_params_t* params, test_ops_t* ops ) { libblis_test_addv( params, &(ops->addv) ); + libblis_test_amaxv( params, &(ops->amaxv) ); libblis_test_axpbyv( params, &(ops->axpbyv) ); libblis_test_axpyv( params, &(ops->axpyv) ); libblis_test_copyv( params, &(ops->copyv) ); @@ -222,6 +223,7 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops ) // Level-1v libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->addv) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 0, &(ops->amaxv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->axpbyv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->axpyv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->copyv) ); @@ -425,7 +427,9 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_4M1A ]) ); - params->ind_enable[ BLIS_NAT ] = 1; + // Read whether to enable native (complex) execution. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_NAT ]) ); // Read the requested error-checking level.
libblis_test_read_next_line( buffer, input_stream ); @@ -943,7 +947,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "problem size: first to test %u\n", params->p_first ); libblis_test_fprintf_c( os, "problem size: max to test %u\n", params->p_max ); libblis_test_fprintf_c( os, "problem size increment %u\n", params->p_inc ); - libblis_test_fprintf_c( os, "test induced complex \n" ); + libblis_test_fprintf_c( os, "complex implementations \n" ); libblis_test_fprintf_c( os, " 3mh? %u\n", params->ind_enable[ BLIS_3MH ] ); libblis_test_fprintf_c( os, " 3m3? %u\n", params->ind_enable[ BLIS_3M3 ] ); libblis_test_fprintf_c( os, " 3m2? %u\n", params->ind_enable[ BLIS_3M2 ] ); @@ -951,7 +955,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, " 4mh? %u\n", params->ind_enable[ BLIS_4MH ] ); libblis_test_fprintf_c( os, " 4m1b (4mb)? %u\n", params->ind_enable[ BLIS_4M1B ] ); libblis_test_fprintf_c( os, " 4m1a (4m1)? %u\n", params->ind_enable[ BLIS_4M1A ] ); - libblis_test_fprintf_c( os, "test native complex? %u\n", params->ind_enable[ BLIS_NAT ] ); + libblis_test_fprintf_c( os, " native? %u\n", params->ind_enable[ BLIS_NAT ] ); libblis_test_fprintf_c( os, "error-checking level %u\n", params->error_checking_level ); libblis_test_fprintf_c( os, "reaction to failure %c\n", params->reaction_to_failure ); libblis_test_fprintf_c( os, "output in matlab format? %u\n", params->output_matlab_format ); @@ -1503,12 +1507,12 @@ void libblis_test_op_driver( test_params_t* params, // Loop over induced methods (or just BLIS_NAT). for ( indi = ind_first; indi <= ind_last; ++indi ) { - // If the current induced method is native execution, OR - // if the current induced method is implemented (for the - // operation being tested) AND it was requested, then we - // enable ONLY that method and proceed. Otherwise, we - // skip the current method and go to the next method. 
- if ( indi == BLIS_NAT ) { ; } + // If the current datatype is real, OR if the current + // induced method is implemented (for the operation + // being tested) AND it was requested, then we enable + // ONLY that method and proceed. Otherwise, we skip the + // current method and go to the next method. + if ( bli_is_real( datatype ) ) { ; } else if ( bli_ind_oper_is_impl( op->opid, indi ) && params->ind_enable[ indi ] == 1 ) { ; } else { continue; } @@ -1875,22 +1879,34 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c -void libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) { - // Start with making p and alias to a. - bli_obj_alias_to( *a, *p ); + bool_t does_inv_diag; - // Then initialize p appropriately for packing. - bli_packm_init_pack( inv_diag, - pack_schema, - BLIS_PACK_FWD_IF_UPPER, - BLIS_PACK_FWD_IF_LOWER, - pack_buf, - bmult_id_m, - bmult_id_n, - a, - p, - cntx ); + if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE; + else does_inv_diag = TRUE; + + // Create a control tree node for the packing operation. + cntl_t* cntl = bli_packm_cntl_obj_create + ( + NULL, // func ptr is not referenced b/c we don't call via l3 _int(). + bli_packm_blk_var1, + bmult_id_m, + bmult_id_n, + does_inv_diag, + FALSE, + FALSE, + pack_schema, + pack_buf, + NULL // no child node needed + ); + + // Pack the contents of A to P. + bli_l3_packm( a, p, cntx, cntl, &BLIS_PACKM_SINGLE_THREADED ); + + // Return the control tree pointer so the caller can free the cntl_t and its + // mem_t entry later on. 
+ return cntl; } @@ -1932,8 +1948,8 @@ void libblis_test_vobj_randomize( test_params_t* params, bool_t normalize, obj_t bli_obj_scalar_init_detached( dt, &kappa ); bli_obj_scalar_init_detached( dt_r, &kappa_r ); - // Normalize vector elements. - //bli_setsc( 1.0/( double )bli_obj_vector_dim( *x ), 0.0, &kappa ); + // Normalize vector elements. The following code ensures that we + // always invert-scale by a whole power of two. bli_normfv( x, &kappa_r ); libblis_test_ceil_pow2( &kappa_r ); bli_copysc( &kappa_r, &kappa ); diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index fab7c1a05..6ecc72d56 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -215,6 +215,7 @@ typedef struct test_ops_s // level-1v test_op_t addv; + test_op_t amaxv; test_op_t axpbyv; test_op_t axpyv; test_op_t copyv; @@ -382,7 +383,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces ); // --- Create object --- void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a ); -void libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ); // --- Randomize/initialize object --- @@ -435,6 +436,7 @@ void libblis_test_check_empty_problem( obj_t* c, double* perf, double* resid ); // Level-1v #include "test_addv.h" +#include "test_amaxv.h" #include "test_axpbyv.h" #include "test_axpyv.h" #include "test_copyv.h" diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 200a6d1a8..bf5f2d6bd 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -221,40 +221,39 @@ void
libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - // Initialize pack objects. - bli_obj_init_pack( &ap ); - bli_obj_init_pack( &bp ); - - // Create pack objects for a and b. - libblis_test_pobj_create( BLIS_MR, - BLIS_MR, - BLIS_INVERT_DIAG, - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - &a, &ap, - &cntx ); - libblis_test_pobj_create( BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx ); + // Create pack objects for a and b, and pack them to ap and bp, + // respectively. + cntl_t* cntl_a = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_MR, + BLIS_INVERT_DIAG, + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + &a, &ap, + &cntx + ); + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_NR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + &cntx + ); // Set the uplo field of ap since the default for packed objects is // BLIS_DENSE, and the _ukernel() wrapper needs this information to // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, ap ); - // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - - // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); bli_copym( &c_save, &c ); @@ -277,9 +276,10 @@ void libblis_test_trsm_ukr_experiment // Zero out performance and residual if output matrix is empty. libblis_test_check_empty_problem( &c, perf, resid ); - // Release packing buffers within pack objects. - bli_obj_release_pack( &ap ); - bli_obj_release_pack( &bp ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. 
+ bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a );