diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 2a512626b..fbab40f5b 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -42,12 +42,11 @@ #include "bli_packm_unb_var1.h" #include "bli_packm_blk_var1.h" -#include "bli_packm_blk_var3.h" -#include "bli_packm_blk_var4.h" +#include "bli_packm_blk_var2.h" -#include "bli_packm_gen_cxk.h" -#include "bli_packm_herm_cxk.h" -#include "bli_packm_tri_cxk.h" +#include "bli_packm_struc_cxk.h" +#include "bli_packm_struc_cxk_4m.h" +#include "bli_packm_struc_cxk_3m.h" #include "bli_packm_cxk.h" #include "bli_packm_cxk_4m.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index d49dd00aa..394613c2c 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -54,11 +54,16 @@ typedef void (*FUNCPTR_T)( void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, dim_t pd_p, inc_t ps_p, + void* packm_ker, packm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); +extern func_t* packm_struc_cxk_kers; +extern func_t* packm_struc_cxk_4m_kers; +extern func_t* packm_struc_cxk_3m_kers; + void bli_packm_blk_var1( obj_t* c, obj_t* p, @@ -93,6 +98,9 @@ void bli_packm_blk_var1( obj_t* c, void* buf_kappa; + func_t* packm_kers; + void* packm_ker; + FUNCPTR_T f; // This variant assumes that the micro-kernel will always apply the @@ -101,6 +109,13 @@ void bli_packm_blk_var1( obj_t* c, // scale during packing. buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE ); + // Choose the correct func_t object. + packm_kers = packm_struc_cxk_kers; + + // Query the datatype-specific function pointer from the func_t object. + packm_ker = bli_func_obj_query( dt_cp, packm_kers ); + + // Index into the type combination array to extract the correct // function pointer. 
f = ftypes[dt_cp]; @@ -123,12 +138,13 @@ void bli_packm_blk_var1( obj_t* c, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, pd_p, ps_p, + packm_ker, t ); } #undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ +#define GENTFUNC( ctype, ch, varname, kertype ) \ \ void PASTEMAC(ch,varname)( \ struc_t strucc, \ @@ -148,9 +164,12 @@ void PASTEMAC(ch,varname)( \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ + void* packm_ker, \ packm_thrinfo_t* thread \ ) \ { \ + PASTECH(ch,kertype) packm_ker_cast = packm_ker; \ +\ ctype* restrict kappa_cast = kappa; \ ctype* restrict c_cast = c; \ ctype* restrict p_cast = p; \ @@ -301,7 +320,7 @@ void PASTEMAC(ch,varname)( \ /* This case executes if the panel belongs to a triangular matrix AND is diagonal-intersecting. Notice that we cannot bury the following conditional logic into - packm_tri_cxk() because we need to know the value of + packm_struc_cxk() because we need to know the value of panel_len_max_i so we can properly increment p_inc. */ \ \ /* Sanity check. Diagonals should not intersect the short end of @@ -334,22 +353,24 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_tri_cxk)( strucc, \ - diagoffp_i, \ - diagc, \ - uploc, \ - conjc, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p ); \ - }\ + packm_ker_cast( strucc, \ + diagoffp_i, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_use, rs_c, cs_c, \ + p_use, rs_p, cs_p ); \ + } \ \ - /* NOTE: This value is usually LESS than ps_p. */ \ + /* NOTE: This value is usually LESS than ps_p because triangular + matrices usually have several micro-panels that are shorter + than a "full" micro-panel. 
*/ \ p_inc = ldp * panel_len_max_i; \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ @@ -363,17 +384,19 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_herm_cxk)( strucc, \ - diagoffc_i, \ - uploc, \ - conjc, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p ); \ + packm_ker_cast( strucc, \ + diagoffc_i, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ } \ \ /* NOTE: This value is equivalent to ps_p. */ \ @@ -390,17 +413,19 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \ - 0, \ - BLIS_DENSE, \ - conjc, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p ); \ + packm_ker_cast( BLIS_GENERAL, \ + 0, \ + diagc, \ + BLIS_DENSE, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ } \ /* if ( row_stored ) \ @@ -420,5 +445,5 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC0( packm_blk_var1 ) +INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t ) diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index b157046ef..e05496183 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -58,6 +58,7 @@ void PASTEMAC(ch,varname)( \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ + void* packm_ker, \ packm_thrinfo_t* thread \ ); diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var2.c similarity index 79% rename from frame/1m/packm/bli_packm_blk_var4.c rename to 
frame/1m/packm/bli_packm_blk_var2.c index 8ff49487a..a002ed95e 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -54,13 +54,18 @@ typedef void (*FUNCPTR_T)( void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, dim_t pd_p, inc_t ps_p, + void* packm_ker, packm_thrinfo_t* thread ); -//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4); +//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2); + +extern func_t* packm_struc_cxk_kers; +extern func_t* packm_struc_cxk_4m_kers; +extern func_t* packm_struc_cxk_3m_kers; -void bli_packm_blk_var4( obj_t* c, +void bli_packm_blk_var2( obj_t* c, obj_t* p, packm_thrinfo_t* t ) { @@ -95,9 +100,13 @@ void bli_packm_blk_var4( obj_t* c, obj_t* kappa_p; void* buf_kappa; + func_t* packm_kers; + void* packm_ker; + FUNCPTR_T f; +/* // We want this variant to behave identically to that of variant 1 // in the real domain. if ( bli_is_real( dt_cp ) ) @@ -105,6 +114,7 @@ void bli_packm_blk_var4( obj_t* c, bli_packm_blk_var1( c, p, t ); return; } +*/ // The value for kappa we use will depend on whether the scalar // attached to A has a nonzero imaginary component. If it does, @@ -140,11 +150,20 @@ void bli_packm_blk_var4( obj_t* c, buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); + // Choose the correct func_t object based on the pack_t schema. + if ( bli_is_4m_packed( schema ) ) packm_kers = packm_struc_cxk_4m_kers; + else if ( bli_is_3m_packed( schema ) ) packm_kers = packm_struc_cxk_3m_kers; + else packm_kers = packm_struc_cxk_kers; + + // Query the datatype-specific function pointer from the func_t object. + packm_ker = bli_func_obj_query( dt_cp, packm_kers ); + + // Index into the type combination array to extract the correct // function pointer. //f = ftypes[dt_cp]; - if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var4; - else f = bli_zpackm_blk_var4; + if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var2; + else f = bli_zpackm_blk_var2; // Invoke the function. 
f( strucc, @@ -164,12 +183,13 @@ void bli_packm_blk_var4( obj_t* c, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, pd_p, ps_p, + packm_ker, t ); } #undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kertype ) \ \ void PASTEMAC(ch,varname)( \ struc_t strucc, \ @@ -189,9 +209,12 @@ void PASTEMAC(ch,varname)( \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ + void* packm_ker, \ packm_thrinfo_t* thread \ ) \ { \ + PASTECH(ch,kertype) packm_ker_cast = packm_ker; \ +\ ctype* restrict kappa_cast = kappa; \ ctype* restrict c_cast = c; \ ctype* restrict p_cast = p; \ @@ -224,6 +247,7 @@ void PASTEMAC(ch,varname)( \ conj_t conjc; \ bool_t row_stored; \ bool_t col_stored; \ + dim_t ss_p; \ \ ctype* restrict c_use; \ ctype* restrict p_use; \ @@ -294,6 +318,15 @@ void PASTEMAC(ch,varname)( \ m_panel_max = &panel_dim_max; \ n_panel_max = &panel_len_max_i; \ } \ +\ + /* Compute the "storage stride" of p. This is usually equal to ldp, + because usually ps_p = ldp * panel_len_max (e.g. where ldp is + equal to rs_p = packnr, or cs_p = packmr). But for 3m, the product + ldp * panel_len_max must be scaled by 3/2. packm_init() has already + scaled ps_p by 3/2, if needed, so rather than scale the product by + 3/2 manually, we just compute the correct scaling factor and use it + instead of ldp. */ \ + ss_p = ps_p / panel_len_max; \ \ /* Compute the total number of iterations we'll need. */ \ num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ @@ -342,7 +375,7 @@ void PASTEMAC(ch,varname)( \ /* This case executes if the panel belongs to a triangular matrix AND is diagonal-intersecting. Notice that we cannot bury the following conditional logic into - packm_tri_cxk() because we need to know the value of + packm_struc_cxk() because we need to know the value of panel_len_max_i so we can properly increment p_inc. */ \ \ /* Sanity check. 
Diagonals should not intersect the short end of @@ -375,41 +408,25 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_tri_cxk_4m)( strucc, \ - diagoffp_i, \ - diagc, \ - uploc, \ - conjc, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p ); \ + packm_ker_cast( strucc, \ + diagoffp_i, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_use, rs_c, cs_c, \ + p_use, rs_p, cs_p ); \ } \ \ - /* NOTE: This value is usually LESS than ps_p. */ \ - p_inc = ldp * panel_len_max_i; \ -\ -/* - if ( rs_p == 1 ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ \ -/* - if ( cs_p == 1 ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ \ -\ + /* NOTE: This value is usually LESS than ps_p because triangular + matrices usually have several micro-panels that are shorter + than a "full" micro-panel. 
*/ \ + p_inc = ss_p * panel_len_max_i; \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ { \ @@ -422,21 +439,23 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_herm_cxk_4m)( strucc, \ - diagoffc_i, \ - uploc, \ - conjc, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p ); \ + packm_ker_cast( strucc, \ + diagoffc_i, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ } \ \ /* NOTE: This value is equivalent to ps_p. */ \ - p_inc = ldp * panel_len_max_i; \ + p_inc = ss_p * panel_len_max_i; \ } \ else \ { \ @@ -449,37 +468,39 @@ void PASTEMAC(ch,varname)( \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ - PASTEMAC(ch,packm_gen_cxk_4m)( BLIS_GENERAL, \ - 0, \ - BLIS_DENSE, \ - conjc, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p ); \ + packm_ker_cast( BLIS_GENERAL, \ + 0, \ + diagc, \ + BLIS_DENSE, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ } \ /* if ( row_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_i", *m_panel_max, *n_panel_max, \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \ } \ */ \ /* if ( col_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_r", *m_panel_max, *n_panel_max, \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, 
*n_panel_max, \ ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_i", *m_panel_max, *n_panel_max, \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \ } \ */ \ \ /* NOTE: This value is equivalent to ps_p. */ \ - p_inc = ldp * panel_len_max_i; \ + p_inc = ss_p * panel_len_max_i; \ } \ \ \ @@ -488,5 +509,5 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNCCO_BASIC0( packm_blk_var4 ) +INSERT_GENTFUNCCO_BASIC( packm_blk_var2, packm_ker_t ) diff --git a/frame/1m/packm/bli_packm_blk_var3.h b/frame/1m/packm/bli_packm_blk_var2.h similarity index 93% rename from frame/1m/packm/bli_packm_blk_var3.h rename to frame/1m/packm/bli_packm_blk_var2.h index f991f46c9..65b86dcb4 100644 --- a/frame/1m/packm/bli_packm_blk_var3.h +++ b/frame/1m/packm/bli_packm_blk_var2.h @@ -32,7 +32,7 @@ */ -void bli_packm_blk_var3( obj_t* c, +void bli_packm_blk_var2( obj_t* c, obj_t* p, packm_thrinfo_t* t ); @@ -58,8 +58,9 @@ void PASTEMAC(ch,varname)( \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ - packm_thrinfo_t* thread \ + void* packm_ker, \ + packm_thrinfo_t* t \ ); -INSERT_GENTPROTCO_BASIC( packm_blk_var3 ) +INSERT_GENTPROTCO_BASIC( packm_blk_var2 ) diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c deleted file mode 100644 index 1ded8234b..000000000 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ /dev/null @@ -1,477 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - pack_t schema, - bool_t invdiag, - bool_t revifup, - bool_t reviflo, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p, - packm_thrinfo_t* thread - ); - -//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3); - - -void bli_packm_blk_var3( obj_t* c, - obj_t* p, - packm_thrinfo_t* t ) -{ - num_t dt_cp = bli_obj_datatype( *c ); - - struc_t strucc = bli_obj_struc( *c ); - doff_t diagoffc = bli_obj_diag_offset( *c ); - diag_t diagc = bli_obj_diag( *c ); - uplo_t uploc = bli_obj_uplo( *c ); - trans_t transc = bli_obj_conjtrans_status( *c ); - pack_t schema = bli_obj_pack_status( *p ); - bool_t invdiag = bli_obj_has_inverted_diag( *p ); - bool_t revifup = bli_obj_is_pack_rev_if_upper( *p ); - bool_t reviflo = bli_obj_is_pack_rev_if_lower( *p ); - - dim_t m_p = bli_obj_length( *p ); - dim_t n_p = bli_obj_width( *p ); - dim_t m_max_p = bli_obj_padded_length( *p ); - dim_t n_max_p = bli_obj_padded_width( *p ); - - void* buf_c = bli_obj_buffer_at_off( *c ); - inc_t rs_c = bli_obj_row_stride( *c ); - inc_t cs_c = bli_obj_col_stride( *c ); - - void* buf_p = bli_obj_buffer_at_off( *p ); - inc_t rs_p = bli_obj_row_stride( *p ); - inc_t cs_p = bli_obj_col_stride( *p ); - dim_t pd_p = bli_obj_panel_dim( *p ); - inc_t ps_p = bli_obj_panel_stride( *p ); - - obj_t kappa; - obj_t* kappa_p; - void* buf_kappa; - - FUNCPTR_T f; - - - // We want this variant to behave identically to that of variant 1 - // in the real domain. - if ( bli_is_real( dt_cp ) ) - { - bli_packm_blk_var1( c, p, t ); - return; - } - - // The value for kappa we use will depend on whether the scalar - // attached to A has a nonzero imaginary component. 
If it does, - // then we will apply the scalar during packing to facilitate - // implementing complex domain micro-kernels in terms of their - // real domain counterparts. (In the aforementioned situation, - // applying a real scalar is easy, but applying a complex one is - // harder, so we avoid the need altogether with the code below.) - if ( thread_am_ochief( t ) ) - { - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } - } - kappa_p = thread_obroadcast( t, kappa_p ); - - - // Acquire the buffer to the kappa chosen above. - buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); - - - // Index into the type combination array to extract the correct - // function pointer. - //f = ftypes[dt_cp]; - if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var3; - else f = bli_zpackm_blk_var3; - - // Invoke the function. 
- f( strucc, - diagoffc, - diagc, - uploc, - transc, - schema, - invdiag, - revifup, - reviflo, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - pd_p, ps_p, - t ); -} - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool_t invdiag, \ - bool_t revifup, \ - bool_t reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - packm_thrinfo_t* thread \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict c_begin; \ - ctype* restrict p_begin; \ -\ - dim_t iter_dim; \ - dim_t num_iter; \ - dim_t it, ic, ip; \ - dim_t ic0, ip0; \ - doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ - dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - dim_t panel_off_i; \ - inc_t vs_c; \ - inc_t ldc; \ - inc_t ldp, p_inc; \ - dim_t* m_panel_full; \ - dim_t* n_panel_full; \ - dim_t* m_panel_use; \ - dim_t* n_panel_use; \ - dim_t* m_panel_max; \ - dim_t* n_panel_max; \ - conj_t conjc; \ - bool_t row_stored; \ - bool_t col_stored; \ -\ - ctype* restrict c_use; \ - ctype* restrict p_use; \ - doff_t diagoffp_i; \ -\ -\ - /* If C is zeros and part of a triangular matrix, then we don't need - to pack it. */ \ - if ( bli_is_zeros( uploc ) && \ - bli_is_triangular( strucc ) ) return; \ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ -\ - /* If c needs a transposition, induce it so that we can more simply - express the remaining parameters and code. 
*/ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_swap_incs( rs_c, cs_c ); \ - bli_negate_diag_offset( diagoffc ); \ - bli_toggle_uplo( uploc ); \ - bli_toggle_trans( transc ); \ - } \ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. */ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ -\ - /* If the row storage flag indicates row storage, then we are packing - to column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( row_stored ) \ - { \ - /* Prepare to pack to row-stored column panels. */ \ - iter_dim = n; \ - panel_len_full = m; \ - panel_len_max = m_max; \ - panel_dim_max = pd_p; \ - ldc = rs_c; \ - vs_c = cs_c; \ - diagoffc_inc = -( doff_t )panel_dim_max; \ - ldp = rs_p; \ - m_panel_full = &m; \ - n_panel_full = &panel_dim_i; \ - m_panel_use = &panel_len_i; \ - n_panel_use = &panel_dim_i; \ - m_panel_max = &panel_len_max_i; \ - n_panel_max = &panel_dim_max; \ - } \ - else /* if ( col_stored ) */ \ - { \ - /* Prepare to pack to column-stored row panels. */ \ - iter_dim = m; \ - panel_len_full = n; \ - panel_len_max = n_max; \ - panel_dim_max = pd_p; \ - ldc = cs_c; \ - vs_c = rs_c; \ - diagoffc_inc = ( doff_t )panel_dim_max; \ - ldp = cs_p; \ - m_panel_full = &panel_dim_i; \ - n_panel_full = &n; \ - m_panel_use = &panel_dim_i; \ - n_panel_use = &panel_len_i; \ - m_panel_max = &panel_dim_max; \ - n_panel_max = &panel_len_max_i; \ - } \ -\ - /* Compute the total number of iterations we'll need. */ \ - num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ -\ - /* Set the initial values and increments for indices related to C and P - based on whether reverse iteration was requested. 
*/ \ - if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \ - ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \ - { \ - ic0 = (num_iter - 1) * panel_dim_max; \ - ic_inc = -panel_dim_max; \ - ip0 = num_iter - 1; \ - ip_inc = -1; \ - } \ - else \ - { \ - ic0 = 0; \ - ic_inc = panel_dim_max; \ - ip0 = 0; \ - ip_inc = 1; \ - } \ -\ - p_begin = p_cast; \ -\ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ - { \ - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ -\ - diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ - c_begin = c_cast + (ic )*vs_c; \ -\ - if ( bli_is_triangular( strucc ) && \ - bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ - { \ - /* This case executes if the panel belongs to a triangular - matrix AND is completely unstored (ie: zero). If the panel - is unstored, we do nothing. (Notice that we don't even - increment p_begin.) */ \ -\ - continue; \ - } \ - else if ( bli_is_triangular( strucc ) && \ - bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \ - { \ - /* This case executes if the panel belongs to a triangular - matrix AND is diagonal-intersecting. Notice that we - cannot bury the following conditional logic into - packm_tri_cxk() because we need to know the value of - panel_len_max_i so we can properly increment p_inc. */ \ -\ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc_i < 0 ) || \ - ( row_stored && diagoffc_i > 0 ) ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ - { \ - panel_off_i = 0; \ - panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ - panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \ - diagoffp_i = diagoffc_i; \ - } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ - { \ - panel_off_i = bli_abs( diagoffc_i ); \ - panel_len_i = panel_len_full - panel_off_i; \ - panel_len_max_i = panel_len_max - panel_off_i; \ - diagoffp_i = 0; \ - } \ -\ - c_use = c_begin + (panel_off_i )*ldc; \ - p_use = p_begin; \ -\ - if( packm_thread_my_iter( it, thread ) ) \ - { \ - PASTEMAC(ch,packm_tri_cxk_3m)( strucc, \ - diagoffp_i, \ - diagc, \ - uploc, \ - conjc, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p ); \ - } \ -\ -\ - /* NOTE: This value is usually LESS than (ps_p*3)/2. */ \ - p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ -\ -/* - if ( cs_p == 1 ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ \ -\ - } \ - else if ( bli_is_herm_or_symm( strucc ) ) \ - { \ - /* This case executes if the panel belongs to a Hermitian or - symmetric matrix, which includes stored, unstored, and - diagonal-intersecting panels. 
*/ \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - if( packm_thread_my_iter( it, thread ) ) \ - { \ - PASTEMAC(ch,packm_herm_cxk_3m)( strucc, \ - diagoffc_i, \ - uploc, \ - conjc, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p ); \ - } \ -\ - /* NOTE: This value is equivalent to (ps_p*3)/2. */ \ - p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ - } \ - else \ - { \ - /* This case executes if the panel is general, or, if the - panel is part of a triangular matrix and is neither unstored - (ie: zero) nor diagonal-intersecting. */ \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - if( packm_thread_my_iter( it, thread ) ) \ - { \ - PASTEMAC(ch,packm_gen_cxk_3m)( BLIS_GENERAL, \ - 0, \ - BLIS_DENSE, \ - conjc, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p ); \ - } \ -\ - /* NOTE: This value is equivalent to (ps_p*3)/2. */ \ - p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ -\ - } \ -/* - if ( row_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ \ -\ -\ - p_begin += p_inc; \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_blk_var3 ) - diff --git a/frame/1m/packm/bli_packm_blk_var4.h b/frame/1m/packm/bli_packm_blk_var4.h deleted file mode 100644 index 942e97491..000000000 --- a/frame/1m/packm/bli_packm_blk_var4.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_packm_blk_var4( obj_t* c, - obj_t* p, - packm_thrinfo_t* t ); - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool_t invdiag, \ - bool_t revifup, \ - bool_t reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - packm_thrinfo_t* t \ - ); - -INSERT_GENTPROTCO_BASIC( packm_blk_var4 ) - diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index 8bb00a859..d79e7fb00 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -37,6 +37,10 @@ blksz_t* packm_mult_ldim; blksz_t* packm_mult_nvec; +func_t* packm_struc_cxk_kers; +func_t* packm_struc_cxk_4m_kers; +func_t* packm_struc_cxk_3m_kers; + packm_t* packm_cntl_row; packm_t* packm_cntl_col; @@ -47,6 +51,30 @@ packm_t* packm_cntl; void bli_packm_cntl_init() { + // Create function pointer object for each datatype-specific packm + // kernel. + packm_struc_cxk_kers + = + bli_func_obj_create( bli_spackm_struc_cxk, FALSE, + bli_dpackm_struc_cxk, FALSE, + bli_cpackm_struc_cxk, FALSE, + bli_zpackm_struc_cxk, FALSE ); + + packm_struc_cxk_4m_kers + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + bli_cpackm_struc_cxk_4m, FALSE, + bli_zpackm_struc_cxk_4m, FALSE ); + + packm_struc_cxk_3m_kers + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + bli_cpackm_struc_cxk_3m, FALSE, + bli_zpackm_struc_cxk_3m, FALSE ); + + // Create blocksize objects for m and n register blocking. 
We will attach // these to the packm control node so they can be used to (a) allocate a // block whose m and n dimension are multiples of mr and nr, and (b) know @@ -119,6 +147,10 @@ void bli_packm_cntl_init() void bli_packm_cntl_finalize() { + bli_func_obj_free( packm_struc_cxk_kers ); + bli_func_obj_free( packm_struc_cxk_4m_kers ); + bli_func_obj_free( packm_struc_cxk_3m_kers ); + bli_cntl_obj_free( packm_cntl_row ); bli_cntl_obj_free( packm_cntl_col ); diff --git a/frame/1m/packm/bli_packm_cxk_3m.c b/frame/1m/packm/bli_packm_cxk_3m.c index 59148eefb..65fef586a 100644 --- a/frame/1m/packm/bli_packm_cxk_3m.c +++ b/frame/1m/packm/bli_packm_cxk_3m.c @@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)( dim_t n, void* kappa, void* a, inc_t inca, inc_t lda, - void* p, inc_t psp, inc_t ldp + void* p, inc_t is_p, inc_t ldp ); #undef FUNCPTR_ARRAY_LENGTH @@ -158,7 +158,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ dim_t panel_dim; \ @@ -187,7 +187,7 @@ void PASTEMAC(ch,varname)( \ n, \ kappa, \ a, inca, lda, \ - p, psp, ldp ); \ + p, is_p, ldp ); \ } \ else \ { \ @@ -196,8 +196,8 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + 1; \ ctype_r* restrict p_r = ( ctype_r* )p; \ - ctype_r* restrict p_i = ( ctype_r* )p + psp; \ - ctype_r* restrict p_rpi = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict p_i = ( ctype_r* )p + is_p; \ + ctype_r* restrict p_rpi = ( ctype_r* )p + 2*is_p; \ const dim_t inca2 = 2*inca; \ const dim_t lda2 = 2*lda; \ \ diff --git a/frame/1m/packm/bli_packm_cxk_3m.h b/frame/1m/packm/bli_packm_cxk_3m.h index d8acb61b2..adbe7acb9 100644 --- a/frame/1m/packm/bli_packm_cxk_3m.h +++ b/frame/1m/packm/bli_packm_cxk_3m.h @@ -44,7 +44,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t 
ldp \ ); INSERT_GENTPROTCO_BASIC( packm_cxk_3m ) diff --git a/frame/1m/packm/bli_packm_cxk_4m.c b/frame/1m/packm/bli_packm_cxk_4m.c index 7261b1358..dd3b60a1a 100644 --- a/frame/1m/packm/bli_packm_cxk_4m.c +++ b/frame/1m/packm/bli_packm_cxk_4m.c @@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)( dim_t n, void* kappa, void* a, inc_t inca, inc_t lda, - void* p, inc_t psp, inc_t ldp + void* p, inc_t is_p, inc_t ldp ); #undef FUNCPTR_ARRAY_LENGTH @@ -159,7 +159,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ dim_t panel_dim; \ @@ -188,7 +188,7 @@ void PASTEMAC(ch,varname)( \ n, \ kappa, \ a, inca, lda, \ - p, psp, ldp ); \ + p, is_p, ldp ); \ } \ else \ { \ @@ -197,7 +197,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + 1; \ ctype_r* restrict p_r = ( ctype_r* )p; \ - ctype_r* restrict p_i = ( ctype_r* )p + psp; \ + ctype_r* restrict p_i = ( ctype_r* )p + is_p; \ const dim_t inca2 = 2*inca; \ const dim_t lda2 = 2*lda; \ \ diff --git a/frame/1m/packm/bli_packm_cxk_4m.h b/frame/1m/packm/bli_packm_cxk_4m.h index c82a77391..0a8d1abf0 100644 --- a/frame/1m/packm/bli_packm_cxk_4m.h +++ b/frame/1m/packm/bli_packm_cxk_4m.h @@ -44,7 +44,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ); INSERT_GENTPROTCO_BASIC( packm_cxk_4m ) diff --git a/frame/1m/packm/bli_packm_gen_cxk.c b/frame/1m/packm/bli_packm_gen_cxk.c deleted file mode 100644 index 022d864cf..000000000 --- a/frame/1m/packm/bli_packm_gen_cxk.c +++ /dev/null @@ -1,401 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ - ) \ -{ \ - ctype* restrict zero = PASTEMAC(ch,0); \ -\ - dim_t panel_dim; \ - dim_t panel_len; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* If the strides of p indicate row storage, then we are packing to - column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_len = m_panel; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_len = n_panel; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ - /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, ldp ); \ -\ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This allows the - micro-kernel to remain simple since it does not need to support - different register blockings for the edge cases. 
*/ \ - if ( m_panel != m_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype* p_edge = p + (i )*rs_p; \ -\ - PASTEMAC(ch,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype* p_edge = p + (j )*cs_p; \ -\ - PASTEMAC(ch,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_gen_cxk ) - - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ - ) \ -{ \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ -\ - dim_t panel_dim; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t psp, ldp; \ -\ -\ - /* If the strides of p indicate row storage, then we are packing to - column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. 
*/ \ - panel_dim = m_panel; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ - /* Compute the panel stride (ie: the element offset to the imaginary - panel). */ \ - psp = ldp * panel_len_max; \ -\ -\ - /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk_4m)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, psp, ldp ); \ -\ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This allows the - micro-kernel to remain simple since it does not need to support - different register blockings for the edge cases. */ \ - if ( m_panel != m_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_4m ) - - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, 
chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ - ) \ -{ \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ -\ - dim_t panel_dim; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t psp, ldp; \ -\ -\ - /* If the strides of p indicate row storage, then we are packing to - column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ - /* Compute the panel stride (ie: the element offset to the imaginary - panel). */ \ - psp = ldp * panel_len_max; \ -\ -\ - /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk_3m)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, psp, ldp ); \ -\ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This allows the - micro-kernel to remain simple since it does not need to support - different register blockings for the edge cases. 
*/ \ - if ( m_panel != m_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ - ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (i )*rs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_rpi, rs_p, cs_p ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ - ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (j )*cs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_rpi, rs_p, cs_p ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_3m ) - diff --git a/frame/1m/packm/bli_packm_herm_cxk.c b/frame/1m/packm/bli_packm_herm_cxk.c deleted file mode 100644 index 2d0893209..000000000 --- a/frame/1m/packm/bli_packm_herm_cxk.c +++ /dev/null @@ -1,1001 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ - ) \ -{ \ - ctype* restrict zero = PASTEMAC(ch,0); \ -\ - dim_t i, j; \ - dim_t panel_len; \ - doff_t diagoffc_abs; \ - dim_t panel_dim; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ - ctype* restrict c10; \ - ctype* restrict p10; \ - dim_t p10_dim, p10_len; \ - inc_t incc10, ldc10; \ - doff_t diagoffc10; \ - conj_t conjc10; \ -\ - ctype* restrict c12; \ - ctype* restrict p12; \ - dim_t p12_dim, p12_len; \ - inc_t incc12, ldc12; \ - doff_t diagoffc12; \ - conj_t conjc12; \ -\ - ctype* restrict c11; \ - ctype* restrict p11; \ - dim_t p11_m; \ - dim_t p11_n; \ - inc_t rs_p11, cs_p11; \ -\ -\ - /* If the strides of p indicate row storage, then we are packing to - column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_len = m_panel; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - rs_p11 = rs_p; \ - cs_p11 = 1; \ - } \ - else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_len = n_panel; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - rs_p11 = 1; \ - cs_p11 = cs_p; \ - } \ -\ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ - { \ - /* If the current panel is unstored, we need to make a few - adjustments so we refer to the data where it is actually - stored, also taking conjugation into account. 
(Note this - implicitly assumes we are operating on a dense panel - within a larger symmetric or Hermitian matrix, since a - general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ - { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ - bli_swap_incs( incc, ldc ); \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc ); \ - } \ -\ - /* Pack the full panel. */ \ - PASTEMAC(ch,packm_cxk)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, ldp ); \ - } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ - { \ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. */ \ - if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \ - ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - diagoffc_abs = bli_abs( diagoffc ); \ -\ - if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs; \ - p10 = p; \ - c10 = c; \ - incc10 = incc; \ - ldc10 = ldc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - diagoffc12 = diagoffc_abs - j; \ - p12 = p + (j )*ldp; \ - c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ - incc12 = ldc; \ - ldc12 = incc; \ - conjc12 = conjc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc12 ); \ - } \ - else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( m_panel, 
n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs + panel_dim; \ - diagoffc10 = diagoffc; \ - p10 = p; \ - c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ - incc10 = ldc; \ - ldc10 = incc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - p12 = p + (j )*ldp; \ - c12 = c + (j )*ldc; \ - incc12 = incc; \ - ldc12 = ldc; \ - conjc12 = conjc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc10 ); \ - } \ -\ - /* Pack to p10. For upper storage, this includes the unstored - triangle of c11. */ \ - PASTEMAC(ch,packm_cxk)( conjc10, \ - p10_dim, \ - p10_len, \ - kappa, \ - c10, incc10, ldc10, \ - p10, ldp ); \ -\ - /* Pack to p12. For lower storage, this includes the unstored - triangle of c11. */ \ - PASTEMAC(ch,packm_cxk)( conjc12, \ - p12_dim, \ - p12_len, \ - kappa, \ - c12, incc12, ldc12, \ - p12, ldp ); \ -\ - /* Pack the stored triangle of c11 to p11. */ \ - { \ - p11_m = panel_dim; \ - p11_n = panel_dim; \ - j = diagoffc_abs; \ - p11 = p + (j )*ldp; \ - c11 = c + (j )*ldc; \ -\ - PASTEMAC(ch,scal2m)( 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - conjc, \ - p11_m, \ - p11_n, \ - kappa, \ - c11, rs_c, cs_c, \ - p11, rs_p11, cs_p11 ); \ -\ - /* If source matrix c is Hermitian, we have to zero out the - imaginary components of the diagonal of p11 in case the - corresponding elements in c11 were not already zero. */ \ - if ( bli_is_hermitian( strucc ) ) \ - { \ - /* NOTE: We can directly increment p11 since we are done - using p11 for the remainder of the function. */ \ - for ( i = 0; i < p11_m; ++i ) \ - { \ - PASTEMAC(ch,seti0s)( *p11 ); \ -\ - p11 += rs_p11 + cs_p11; \ - } \ - } \ - } \ - } \ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). 
When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This allows the - micro-kernel to remain simple since it does not need to support - different register blockings for the edge cases. */ \ - if ( m_panel != m_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype* p_edge = p + (i )*rs_p; \ -\ - PASTEMAC(ch,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype* p_edge = p + (j )*cs_p; \ -\ - PASTEMAC(ch,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_herm_cxk ) - - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ - ) \ -{ \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r* restrict p_r = ( ctype_r* )p; \ -\ - dim_t i, j; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - doff_t diagoffc_abs; \ - dim_t panel_dim; \ - inc_t incc, ldc; \ - inc_t psp, ldp; \ -\ - ctype* restrict c10; \ - ctype_r* restrict p10; \ - dim_t p10_dim, p10_len; \ - inc_t incc10, ldc10; \ - doff_t diagoffc10; \ - conj_t conjc10; \ -\ - ctype* restrict c12; \ - ctype_r* restrict p12; \ - dim_t p12_dim, p12_len; \ - inc_t incc12, ldc12; \ - doff_t diagoffc12; \ - conj_t 
conjc12; \ -\ - inc_t rs_p11, cs_p11; \ -\ -\ - /* If the strides of p indicate row storage, then we are packing to - column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - rs_p11 = rs_p; \ - cs_p11 = 1; \ - } \ - else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - rs_p11 = 1; \ - cs_p11 = cs_p; \ - } \ -\ - /* Compute the panel stride (ie: the element offset to the imaginary - panel). */ \ - psp = ldp * panel_len_max; \ -\ -\ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ - { \ - /* If the current panel is unstored, we need to make a few - adjustments so we refer to the data where it is actually - stored, also taking conjugation into account. (Note this - implicitly assumes we are operating on a dense panel - within a larger symmetric or Hermitian matrix, since a - general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ - { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ - bli_swap_incs( incc, ldc ); \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc ); \ - } \ -\ - /* Pack the full panel. */ \ - PASTEMAC(ch,packm_cxk_4m)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, psp, ldp ); \ - } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ - { \ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. 
If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. */ \ - if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \ - ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - diagoffc_abs = bli_abs( diagoffc ); \ -\ - if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs; \ - p10 = p_r; \ - c10 = c; \ - incc10 = incc; \ - ldc10 = ldc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - diagoffc12 = diagoffc_abs - j; \ - p12 = p_r + (j )*ldp; \ - c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ - incc12 = ldc; \ - ldc12 = incc; \ - conjc12 = conjc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc12 ); \ - } \ - else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs + panel_dim; \ - diagoffc10 = diagoffc; \ - p10 = p_r; \ - c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ - incc10 = ldc; \ - ldc10 = incc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - p12 = p_r + (j )*ldp; \ - c12 = c + (j )*ldc; \ - incc12 = incc; \ - ldc12 = ldc; \ - conjc12 = conjc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc10 ); \ - } \ -\ - /* Pack to p10. For upper storage, this includes the unstored - triangle of c11. 
*/ \ - PASTEMAC(ch,packm_cxk_4m)( conjc10, \ - p10_dim, \ - p10_len, \ - kappa, \ - c10, incc10, ldc10, \ - p10, psp, ldp ); \ -\ - /* Pack to p12. For lower storage, this includes the unstored - triangle of c11. */ \ - PASTEMAC(ch,packm_cxk_4m)( conjc12, \ - p12_dim, \ - p12_len, \ - kappa, \ - c12, incc12, ldc12, \ - p12, psp, ldp ); \ -\ - /* Pack the stored triangle of c11 to p11. */ \ - { \ - dim_t p11_m = panel_dim; \ - dim_t p11_n = panel_dim; \ - inc_t rs_c11 = 2*rs_c; \ - inc_t cs_c11 = 2*cs_c; \ - dim_t j = diagoffc_abs; \ - ctype* c11 = ( ctype* )c + (j )*ldc; \ - ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \ - ctype_r* c11_r = ( ctype_r* )c11; \ - ctype_r* c11_i = ( ctype_r* )c11 + 1; \ - ctype_r* p11_r = ( ctype_r* )p11; \ - ctype_r* p11_i = ( ctype_r* )p11 + psp; \ - ctype_r* alpha_r = one_r; \ - ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \ - ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \ - ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \ -\ - /* Copy the real part of the stored triangle of c11 to p11_r. */ \ - PASTEMAC(chr,scal2m)( 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - BLIS_NO_TRANSPOSE, \ - p11_m, \ - p11_n, \ - alpha_r, \ - c11_r, rs_c11, cs_c11, \ - p11_r, rs_p11, cs_p11 ); \ -\ - /* Copy the imaginary part of the stored triangle of c11 to p11_i, - scaling by -1 if conjugation on c was requested. */ \ - PASTEMAC(chr,scal2m)( 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - BLIS_NO_TRANSPOSE, \ - p11_m, \ - p11_n, \ - alpha_i, \ - c11_i, rs_c11, cs_c11, \ - p11_i, rs_p11, cs_p11 ); \ -\ - /* If source matrix c is Hermitian, we have to zero out the - imaginary components of the diagonal of p11 in case the - corresponding elements in c11 were not already zero. 
*/ \ - if ( bli_is_hermitian( strucc ) ) \ - { \ - for ( i = 0; i < p11_m; ++i ) \ - { \ - ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \ -\ - PASTEMAC(chr,set0s)( *pi11_i ); \ - } \ - } \ -\ - /* Apply kappa to the part of p11 that corresponds to the stored - part of c11 that was copied above. */ \ - if ( bli_is_upper( uploc ) ) \ - { \ - PASTEMAC(ch,scalris_mxn_u)( 0, \ - p11_m, \ - p11_n, \ - &kappa_r, \ - &kappa_i, \ - p11_r, \ - p11_i, rs_p11, cs_p11 ); \ - } \ - else \ - { \ - PASTEMAC(ch,scalris_mxn_l)( 0, \ - p11_m, \ - p11_n, \ - &kappa_r, \ - &kappa_i, \ - p11_r, \ - p11_i, rs_p11, cs_p11 ); \ - } \ -/* - PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \ - p_r + 0*psp, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \ - p_r + 1*psp, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ - } \ - } \ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This allows the - micro-kernel to remain simple since it does not need to support - different register blockings for the edge cases. 
*/ \ - if ( m_panel != m_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_4m ) - - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ - ) \ -{ \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r* restrict p_r = ( ctype_r* )p; \ -\ - dim_t i, j; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - doff_t diagoffc_abs; \ - dim_t panel_dim; \ - inc_t incc, ldc; \ - inc_t psp, ldp; \ -\ - ctype* restrict c10; \ - ctype_r* restrict p10; \ - dim_t p10_dim, p10_len; \ - inc_t incc10, ldc10; \ - doff_t diagoffc10; \ - conj_t conjc10; \ -\ - ctype* 
restrict c12; \ - ctype_r* restrict p12; \ - dim_t p12_dim, p12_len; \ - inc_t incc12, ldc12; \ - doff_t diagoffc12; \ - conj_t conjc12; \ -\ - inc_t rs_p11, cs_p11; \ -\ -\ - /* If the strides of p indicate row storage, then we are packing to - column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - rs_p11 = rs_p; \ - cs_p11 = 1; \ - } \ - else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - rs_p11 = 1; \ - cs_p11 = cs_p; \ - } \ -\ - /* Compute the panel stride (ie: the element offset to the imaginary - panel). */ \ - psp = ldp * panel_len_max; \ -\ -\ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ - { \ - /* If the current panel is unstored, we need to make a few - adjustments so we refer to the data where it is actually - stored, also taking conjugation into account. (Note this - implicitly assumes we are operating on a dense panel - within a larger symmetric or Hermitian matrix, since a - general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ - { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ - bli_swap_incs( incc, ldc ); \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc ); \ - } \ -\ - /* Pack the full panel. 
*/ \ - PASTEMAC(ch,packm_cxk_3m)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, psp, ldp ); \ - } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ - { \ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. */ \ - if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \ - ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - diagoffc_abs = bli_abs( diagoffc ); \ -\ - if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs; \ - p10 = p_r; \ - c10 = c; \ - incc10 = incc; \ - ldc10 = ldc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - diagoffc12 = diagoffc_abs - j; \ - p12 = p_r + (j )*ldp; \ - c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ - incc12 = ldc; \ - ldc12 = incc; \ - conjc12 = conjc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc12 ); \ - } \ - else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs + panel_dim; \ - diagoffc10 = diagoffc; \ - p10 = p_r; \ - c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ - incc10 = ldc; \ - ldc10 = incc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - p12 = p_r + (j )*ldp; \ - c12 = c + (j )*ldc; \ - incc12 = 
incc; \ - ldc12 = ldc; \ - conjc12 = conjc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( conjc10 ); \ - } \ -\ - /* Pack to p10. For upper storage, this includes the unstored - triangle of c11. */ \ - PASTEMAC(ch,packm_cxk_3m)( conjc10, \ - p10_dim, \ - p10_len, \ - kappa, \ - c10, incc10, ldc10, \ - p10, psp, ldp ); \ -\ - /* Pack to p12. For lower storage, this includes the unstored - triangle of c11. */ \ - PASTEMAC(ch,packm_cxk_3m)( conjc12, \ - p12_dim, \ - p12_len, \ - kappa, \ - c12, incc12, ldc12, \ - p12, psp, ldp ); \ -\ - /* Pack the stored triangle of c11 to p11. */ \ - { \ - dim_t p11_m = panel_dim; \ - dim_t p11_n = panel_dim; \ - inc_t rs_c11 = 2*rs_c; \ - inc_t cs_c11 = 2*cs_c; \ - dim_t j = diagoffc_abs; \ - ctype* c11 = ( ctype* )c + (j )*ldc; \ - ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \ - ctype_r* c11_r = ( ctype_r* )c11; \ - ctype_r* c11_i = ( ctype_r* )c11 + 1; \ - ctype_r* p11_r = ( ctype_r* )p11; \ - ctype_r* p11_i = ( ctype_r* )p11 + psp; \ - ctype_r* alpha_r = one_r; \ - ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \ - ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \ - ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \ -\ - /* Copy the real part of the stored triangle of c11 to p11_r. */ \ - PASTEMAC(chr,scal2m)( 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - BLIS_NO_TRANSPOSE, \ - p11_m, \ - p11_n, \ - alpha_r, \ - c11_r, rs_c11, cs_c11, \ - p11_r, rs_p11, cs_p11 ); \ -\ - /* Copy the imaginary part of the stored triangle of c11 to p11_i, - scaling by -1 if conjugation on c was requested. */ \ - PASTEMAC(chr,scal2m)( 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - BLIS_NO_TRANSPOSE, \ - p11_m, \ - p11_n, \ - alpha_i, \ - c11_i, rs_c11, cs_c11, \ - p11_i, rs_p11, cs_p11 ); \ -\ - /* If source matrix c is Hermitian, we have to zero out the - imaginary components of the diagonal of p11 in case the - corresponding elements in c11 were not already zero. 
*/ \ - if ( bli_is_hermitian( strucc ) ) \ - { \ - for ( i = 0; i < p11_m; ++i ) \ - { \ - ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \ -\ - PASTEMAC(chr,set0s)( *pi11_i ); \ - } \ - } \ -\ - /* Apply kappa to the part of p11 that corresponds to the stored - part of c11 that was copied above. */ \ - if ( bli_is_upper( uploc ) ) \ - { \ - PASTEMAC(ch,scalris_mxn_u)( 0, \ - p11_m, \ - p11_n, \ - &kappa_r, \ - &kappa_i, \ - p11_r, \ - p11_i, rs_p11, cs_p11 ); \ - } \ - else \ - { \ - PASTEMAC(ch,scalris_mxn_l)( 0, \ - p11_m, \ - p11_n, \ - &kappa_r, \ - &kappa_i, \ - p11_r, \ - p11_i, rs_p11, cs_p11 ); \ - } \ -\ - /* Update the p11 section of the ri panel. It simply needs - to contain the sum of p11_r + p11_i. */ \ - { \ - ctype_r* p11_rpi = p11_i + psp; \ -\ - for ( j = 0; j < p11_n; ++j ) \ - for ( i = 0; i < p11_m; ++i ) \ - { \ - ctype_r* pi11_r = p11_r + (i )*rs_p11 + (j )*cs_p11; \ - ctype_r* pi11_i = p11_i + (i )*rs_p11 + (j )*cs_p11; \ - ctype_r* pi11_rpi = p11_rpi + (i )*rs_p11 + (j )*cs_p11; \ -\ - PASTEMAC(chr,add3s)( *pi11_r, \ - *pi11_i, \ - *pi11_rpi ); \ - } \ - } \ -/* - PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \ - p_r + 0*psp, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \ - p_r + 1*psp, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ - } \ - } \ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This allows the - micro-kernel to remain simple since it does not need to support - different register blockings for the edge cases. 
*/ \ - if ( m_panel != m_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ - ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (i )*rs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_rpi, rs_p, cs_p ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ - ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (j )*cs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_rpi, rs_p, cs_p ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_3m ) - diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 58ebd6d4b..e5da6ad57 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -44,9 +44,9 @@ static FUNCPTR_T vars[6][3] = { // unblocked optimized unblocked blocked { bli_packm_unb_var1, NULL, bli_packm_blk_var1 }, + { NULL, NULL, bli_packm_blk_var2 }, + { NULL, NULL, NULL, }, { NULL, NULL, NULL, }, - { NULL, NULL, bli_packm_blk_var3 }, - { NULL, NULL, bli_packm_blk_var4 }, { NULL, NULL, NULL, }, { NULL, NULL, NULL, }, 
}; diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c new file mode 100644 index 000000000..c482206a8 --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -0,0 +1,511 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t panel_len; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ +\ + /* Handle micro-panel packing based on the structure of the matrix + being packed. */ \ + if ( bli_is_general( strucc ) ) \ + { \ + /* For micro-panels of general matrices, we can call the pack + kernel front-end directly. */ \ + PASTEMAC(ch,kername)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp ); \ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + /* Call a helper function for micro-panels of Hermitian/symmetric + matrices. 
*/ \ + PASTEMAC(ch,packm_herm_cxk)( strucc, \ + diagoffc, \ + uploc, \ + conjc, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp ); \ + } \ + else /* ( bli_is_triangular( strucc ) ) */ \ + { \ + /* Call a helper function for micro-panels of triangular + matrices. */ \ + PASTEMAC(ch,packm_tri_cxk)( strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp ); \ + } \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimension are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. */ \ + if ( m_panel != m_panel_max ) \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype* p_edge = p + (i )*rs_p; \ +\ + PASTEMAC(ch,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype* p_edge = p + (j )*cs_p; \ +\ + PASTEMAC(ch,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ + } \ +\ +\ + if ( bli_is_triangular( strucc ) ) \ + { \ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. 
Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting multiplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. */ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t i = m_panel; \ + dim_t j = n_panel; \ + dim_t m_br = m_panel_max - i; \ + dim_t n_br = n_panel_max - j; \ + ctype* p_br = p + (i )*rs_p + (j )*cs_p; \ +\ + PASTEMAC(ch,setd)( 0, \ + m_br, \ + n_br, \ + one, \ + p_br, rs_p, cs_p ); \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) + + + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + ) \ +{ \ + doff_t diagoffc_abs; \ + dim_t i, j; \ + bool_t row_stored; \ + bool_t col_stored; \ +\ +\ + /* Create flags to indicate row or column storage. Since we don't + have the schema in scope, we must use the dimensions and strides + of the micro-panel to determine whether it is row- or column- + stored. */ \ + row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \ + col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \ +\ + /* Handle the case where the micro-panel does NOT intersect the + diagonal separately from the case where it does intersect. 
*/ \ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,kername)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + ctype* restrict c10; \ + ctype* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + if ( ( col_stored && diagoffc < 0 ) || \ + ( row_stored && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( row_stored && bli_is_upper( uploc ) ) || \ + ( col_stored && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ + ( col_stored && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc10, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, ldp ); \ +\ + /* Pack to p12. For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc12, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp ); \ +\ + /* Pack the stored triangle of c11 to p11. 
*/ \ + { \ + ctype* restrict c11; \ + ctype* restrict p11; \ + dim_t p11_m; \ + dim_t p11_n; \ + inc_t rs_p11; \ + inc_t cs_p11; \ +\ + p11_m = panel_dim; \ + p11_n = panel_dim; \ + j = diagoffc_abs; \ + p11 = p + (j )*ldp; \ + c11 = c + (j )*ldc; \ +\ + /* Compute the row and column strides of p11. */ \ + if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \ + else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \ +\ +\ + PASTEMAC(ch,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + conjc, \ + p11_m, \ + p11_n, \ + kappa, \ + c11, rs_c, cs_c, \ + p11, rs_p11, cs_p11 ); \ +\ + /* If source matrix c is Hermitian, we have to zero out the + imaginary components of the diagonal of p11 in case the + corresponding elements in c11 were not already zero. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + /* NOTE: We can directly increment p11 since we are done + using p11 for the remainder of the function. */ \ + for ( i = 0; i < p11_m; ++i ) \ + { \ + PASTEMAC(ch,seti0s)( *p11 ); \ +\ + p11 += rs_p11 + cs_p11; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk ) + + + + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + ) \ +{ \ + /* Pack the panel. */ \ + PASTEMAC(ch,kername)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp ); \ +\ +\ + /* If the diagonal of c is implicitly unit, explicitly set the + diagonal of the packed panel to kappa. 
*/ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + PASTEMAC(ch,setd)( diagoffp, \ + m_panel, \ + n_panel, \ + kappa, \ + p, rs_p, cs_p ); \ + } \ +\ + /* If requested, invert the diagonal of the packed panel. */ \ + if ( invdiag == TRUE ) \ + { \ + PASTEMAC(ch,invertd)( diagoffp, \ + m_panel, \ + n_panel, \ + p, rs_p, cs_p ); \ + } \ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). Note that this zero-filling is not needed for + trsm, since the unstored region is not referenced by the trsm + micro-kernel; however, zero-filling is needed for trmm, which + uses the gemm micro-kernel.*/ \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + uplo_t uplop = uploc; \ +\ + bli_toggle_uplo( uplop ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \ +\ + PASTEMAC(ch,setm)( diagoffp, \ + BLIS_NONUNIT_DIAG, \ + uplop, \ + m_panel, \ + n_panel, \ + zero, \ + p, rs_p, cs_p ); \ + } \ +\ +\ +} + +INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk ) + diff --git a/frame/1m/packm/bli_packm_tri_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h similarity index 69% rename from frame/1m/packm/bli_packm_tri_cxk.h rename to frame/1m/packm/bli_packm_struc_cxk.h index 63756cd34..f1e669c77 100644 --- a/frame/1m/packm/bli_packm_tri_cxk.h +++ b/frame/1m/packm/bli_packm_struc_cxk.h @@ -51,15 +51,41 @@ void PASTEMAC(ch,varname)( \ ctype* restrict p, inc_t rs_p, inc_t cs_p \ ); -INSERT_GENTPROT_BASIC( packm_tri_cxk ) +INSERT_GENTPROT_BASIC( packm_struc_cxk ) -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ struc_t strucc, \ - doff_t diagoffp, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + 
dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + ); + +INSERT_GENTPROT_BASIC( packm_herm_cxk ) + + + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ @@ -68,12 +94,14 @@ void PASTEMAC(ch,varname)( \ dim_t n_panel, \ dim_t m_panel_max, \ dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ ); -INSERT_GENTPROTCO_BASIC( packm_tri_cxk_4m ) - -INSERT_GENTPROTCO_BASIC( packm_tri_cxk_3m ) +INSERT_GENTPROT_BASIC( packm_tri_cxk ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_3m.c b/frame/1m/packm/bli_packm_struc_cxk_3m.c new file mode 100644 index 000000000..cb90c1058 --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_3m.c @@ -0,0 +1,688 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + inc_t incc, ldc; \ + inc_t is_p, ldp; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. 
*/ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the imaginary stride (ie: the element offset to the imaginary + panel). */ \ + is_p = ldp * panel_len_max; \ +\ +\ + /* Handle micro-panel packing based on the structure of the matrix + being packed. */ \ + if ( bli_is_general( strucc ) ) \ + { \ + /* For micro-panels of general matrices, we can call the pack + kernel front-end directly. */ \ + PASTEMAC(ch,kername)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, is_p, ldp ); \ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + /* Call a helper function for micro-panels of Hermitian/symmetric + matrices. */ \ + PASTEMAC(ch,packm_herm_cxk_3m)( strucc, \ + diagoffc, \ + uploc, \ + conjc, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + is_p, ldp ); \ + } \ + else /* ( bli_is_triangular( strucc ) ) */ \ + { \ + /* Call a helper function for micro-panels of triangular + matrices. */ \ + PASTEMAC(ch,packm_tri_cxk_3m)( strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + is_p, ldp ); \ + } \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimension are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. 
If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. */ \ + if ( m_panel != m_panel_max ) \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + is_p + (i )*rs_p; \ + ctype_r* p_edge_rpi = ( ctype_r* )p + 2*is_p + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_rpi, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + is_p + (j )*cs_p; \ + ctype_r* p_edge_rpi = ( ctype_r* )p + 2*is_p + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_rpi, rs_p, cs_p ); \ + } \ +\ +\ + if ( bli_is_triangular( strucc ) ) \ + { \ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. 
+ NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting multiplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. */ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t i = m_panel; \ + dim_t j = n_panel; \ + dim_t m_br = m_panel_max - i; \ + dim_t n_br = n_panel_max - j; \ + ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \ + ctype_r* p_br_i = ( ctype_r* )p + is_p + (i )*rs_p + (j )*cs_p; \ +\ + PASTEMAC(chr,setd)( 0, \ + m_br, \ + n_br, \ + one_r, \ + p_br_r, rs_p, cs_p ); \ + PASTEMAC(chr,setd)( 0, \ + m_br, \ + n_br, \ + zero_r, \ + p_br_i, rs_p, cs_p ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_3m, packm_cxk_3m ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, inc_t ldp \ + ) \ +{ \ + doff_t diagoffc_abs; \ + dim_t i, j; \ + bool_t row_stored; \ + bool_t col_stored; \ +\ +\ + /* Create flags to indicate row or column storage. Since we don't + have the schema in scope, we must use the dimensions and strides + of the micro-panel to determine whether it is row- or column- + stored. 
*/ \ + row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \ + col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \ +\ +\ + /* Handle the case where the micro-panel does NOT intersect the + diagonal separately from the case where it does intersect. */ \ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,kername)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, is_p, ldp ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + ctype_r* restrict p_r = ( ctype_r* )p; \ +\ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + ctype* restrict c10; \ + ctype_r* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype_r* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + if ( ( col_stored && diagoffc < 0 ) || \ + ( row_stored && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( row_stored && bli_is_upper( uploc ) ) || \ + ( col_stored && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p_r; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ + ( col_stored && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p_r; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc10, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, is_p, ldp ); \ +\ + /* Pack to p12. For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc12, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, is_p, ldp ); \ +\ + /* Pack the stored triangle of c11 to p11. 
*/ \ + { \ + inc_t cs_p11; \ + inc_t rs_p11; \ + dim_t p11_m = panel_dim; \ + dim_t p11_n = panel_dim; \ + inc_t rs_c11 = 2*rs_c; \ + inc_t cs_c11 = 2*cs_c; \ + dim_t j = diagoffc_abs; \ + ctype* c11 = ( ctype* )c + (j )*ldc; \ + ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \ + ctype_r* c11_r = ( ctype_r* )c11; \ + ctype_r* c11_i = ( ctype_r* )c11 + 1; \ + ctype_r* p11_r = ( ctype_r* )p11; \ + ctype_r* p11_i = ( ctype_r* )p11 + is_p; \ + ctype_r* alpha_r = one_r; \ + ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \ + ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \ + ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \ +\ + /* Compute the row and column strides of p11. */ \ + if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \ + else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \ +\ + /* Copy the real part of the stored triangle of c11 to p11_r. */ \ + PASTEMAC(chr,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + BLIS_NO_TRANSPOSE, \ + p11_m, \ + p11_n, \ + alpha_r, \ + c11_r, rs_c11, cs_c11, \ + p11_r, rs_p11, cs_p11 ); \ +\ + /* Copy the imaginary part of the stored triangle of c11 to p11_i, + scaling by -1 if conjugation on c was requested. */ \ + PASTEMAC(chr,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + BLIS_NO_TRANSPOSE, \ + p11_m, \ + p11_n, \ + alpha_i, \ + c11_i, rs_c11, cs_c11, \ + p11_i, rs_p11, cs_p11 ); \ +\ + /* If source matrix c is Hermitian, we have to zero out the + imaginary components of the diagonal of p11 in case the + corresponding elements in c11 were not already zero. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + for ( i = 0; i < p11_m; ++i ) \ + { \ + ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \ +\ + PASTEMAC(chr,set0s)( *pi11_i ); \ + } \ + } \ +\ + /* Apply kappa to the part of p11 that corresponds to the stored + part of c11 that was copied above. 
*/ \ + if ( bli_is_upper( uploc ) ) \ + { \ + PASTEMAC(ch,scalris_mxn_u)( 0, \ + p11_m, \ + p11_n, \ + &kappa_r, \ + &kappa_i, \ + p11_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ + else \ + { \ + PASTEMAC(ch,scalris_mxn_l)( 0, \ + p11_m, \ + p11_n, \ + &kappa_r, \ + &kappa_i, \ + p11_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ +\ + /* Update the p11 section of the ri panel. It simply needs + to contain the sum of p11_r + p11_i. */ \ + { \ + ctype_r* p11_rpi = p11_i + is_p; \ +\ + for ( j = 0; j < p11_n; ++j ) \ + for ( i = 0; i < p11_m; ++i ) \ + { \ + ctype_r* pi11_r = p11_r + (i )*rs_p11 + (j )*cs_p11; \ + ctype_r* pi11_i = p11_i + (i )*rs_p11 + (j )*cs_p11; \ + ctype_r* pi11_rpi = p11_rpi + (i )*rs_p11 + (j )*cs_p11; \ +\ + PASTEMAC(chr,add3s)( *pi11_r, \ + *pi11_i, \ + *pi11_rpi ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_3m, packm_cxk_3m ) + + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, inc_t ldp \ + ) \ +{ \ + bool_t row_stored; \ + bool_t col_stored; \ +\ +\ + /* Create flags to incidate row or column storage. Since we don't + have the schema in scope, we must use the dimensions and strides + of the micro-panel to determine whether it is row- or column- + stored. */ \ + row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \ + col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \ +\ +\ + /* Pack the panel. 
*/ \ + PASTEMAC(ch,kername)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, is_p, ldp ); \ +\ +\ + /* Tweak the panel according to its triangular structure */ \ + { \ + dim_t j = bli_abs( diagoffp ); \ + ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \ + ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \ + ctype_r* p11_rpi = ( ctype_r* )p + 2*is_p + (j )*ldp; \ + inc_t cs_p11; \ + inc_t rs_p11; \ +\ + /* Compute the row and column strides of p11. */ \ + if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \ + else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \ +\ +\ + /* If the diagonal of c is implicitly unit, explicitly set the + the diagonal of the packed panel to kappa. */ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \ + ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \ + dim_t i; \ +\ + PASTEMAC(chr,setd)( 0, \ + m_panel, \ + n_panel, \ + &kappa_r, \ + p11_r, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setd)( 0, \ + m_panel, \ + n_panel, \ + &kappa_i, \ + p11_i, rs_p11, cs_p11 ); \ +\ + /* Update the diagonal of the p11 section of the rpi panel. + It simply needs to contain the sum of diagonals of p11_r + and p11_i. */ \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_rpi = p11_rpi + (i )*rs_p + (i )*cs_p; \ +\ + PASTEMAC(chr,add3s)( *pi11_r, *pi11_i, *pi11_rpi ); \ + } \ + } \ +\ + /* If requested, invert the diagonal of the packed panel. Note + that we do not need to update the ri panel since inverted + diagonals are only needed by trsm, which does not use the + p11 section of the ri panel. 
*/ \ + if ( invdiag == TRUE ) \ + { \ + dim_t i; \ +\ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ +\ + PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \ + } \ + } \ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). Note that this zero-filling is not needed for + trsm, since the unstored region is not referenced by the trsm + micro-kernel; however, zero-filling is needed for trmm, which + uses the gemm micro-kernel.*/ \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + uplo_t uplop11 = uploc; \ + doff_t diagoffp11 = 0; \ +\ + bli_toggle_uplo( uplop11 ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \ +\ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_r, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_i, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_rpi, rs_p11, cs_p11 ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_3m, packm_cxk_3m ) + diff --git a/frame/1m/packm/bli_packm_herm_cxk.h b/frame/1m/packm/bli_packm_struc_cxk_3m.h similarity index 65% rename from frame/1m/packm/bli_packm_herm_cxk.h rename to frame/1m/packm/bli_packm_struc_cxk_3m.h index b574f689e..e24a290d5 100644 --- a/frame/1m/packm/bli_packm_herm_cxk.h +++ b/frame/1m/packm/bli_packm_struc_cxk_3m.h @@ -32,14 +32,16 @@ */ -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( 
\ struc_t strucc, \ - doff_t diagoffc, \ + doff_t diagoffp, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ + bool_t invdiag, \ dim_t m_panel, \ dim_t n_panel, \ dim_t m_panel_max, \ @@ -49,7 +51,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict p, inc_t rs_p, inc_t cs_p \ ); -INSERT_GENTPROT_BASIC( packm_herm_cxk ) +INSERT_GENTPROTCO_BASIC( packm_struc_cxk_3m ) @@ -65,11 +67,41 @@ void PASTEMAC(ch,varname)( \ dim_t n_panel, \ dim_t m_panel_max, \ dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, inc_t ldp \ ); -INSERT_GENTPROTCO_BASIC( packm_herm_cxk_4m ) - INSERT_GENTPROTCO_BASIC( packm_herm_cxk_3m ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_3m ) + diff --git a/frame/1m/packm/bli_packm_struc_cxk_4m.c b/frame/1m/packm/bli_packm_struc_cxk_4m.c new file mode 100644 index 000000000..a11ff2e10 --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_4m.c @@ -0,0 +1,638 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + inc_t incc, ldc; \ + inc_t is_p, ldp; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the imaginary stride (ie: the element offset to the imaginary + panel). */ \ + is_p = ldp * panel_len_max; \ +\ +\ + /* Handle micro-panel packing based on the structure of the matrix + being packed. */ \ + if ( bli_is_general( strucc ) ) \ + { \ + /* For micro-panels of general matrices, we can call the pack + kernel front-end directly. */ \ + PASTEMAC(ch,kername)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, is_p, ldp ); \ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + /* Call a helper function for micro-panels of Hermitian/symmetric + matrices. 
*/ \ + PASTEMAC(ch,packm_herm_cxk_4m)( strucc, \ + diagoffc, \ + uploc, \ + conjc, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + is_p, ldp ); \ + } \ + else /* ( bli_is_triangular( strucc ) ) */ \ + { \ + /* Call a helper function for micro-panels of triangular + matrices. */ \ + PASTEMAC(ch,packm_tri_cxk_4m)( strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + is_p, ldp ); \ + } \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimension are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. 
*/ \ + if ( m_panel != m_panel_max ) \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + is_p + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + is_p + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + } \ +\ +\ + if ( bli_is_triangular( strucc ) ) \ + { \ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting muliplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. 
*/ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t i = m_panel; \ + dim_t j = n_panel; \ + dim_t m_br = m_panel_max - i; \ + dim_t n_br = n_panel_max - j; \ + ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \ + ctype_r* p_br_i = ( ctype_r* )p + is_p + (i )*rs_p + (j )*cs_p; \ +\ + PASTEMAC(chr,setd)( 0, \ + m_br, \ + n_br, \ + one_r, \ + p_br_r, rs_p, cs_p ); \ + PASTEMAC(chr,setd)( 0, \ + m_br, \ + n_br, \ + zero_r, \ + p_br_i, rs_p, cs_p ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_4m, packm_cxk_4m ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, inc_t ldp \ + ) \ +{ \ + doff_t diagoffc_abs; \ + dim_t i, j; \ + bool_t row_stored; \ + bool_t col_stored; \ +\ +\ + /* Create flags to incidate row or column storage. Since we don't + have the schema in scope, we must use the dimensions and strides + of the micro-panel to determine whether it is row- or column- + stored. */ \ + row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \ + col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \ +\ +\ + /* Handle the case where the micro-panel does NOT intersect the + diagonal separately from the case where it does intersect. 
*/ \ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,kername)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, is_p, ldp ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + ctype_r* restrict p_r = ( ctype_r* )p; \ +\ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + ctype* restrict c10; \ + ctype_r* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype_r* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + if ( ( col_stored && diagoffc < 0 ) || \ + ( row_stored && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( row_stored && bli_is_upper( uploc ) ) || \ + ( col_stored && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p_r; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ + ( col_stored && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p_r; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc10, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, is_p, ldp ); \ +\ + /* Pack to p12. For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc12, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, is_p, ldp ); \ +\ + /* Pack the stored triangle of c11 to p11. 
*/ \ + { \ + inc_t cs_p11; \ + inc_t rs_p11; \ + dim_t p11_m = panel_dim; \ + dim_t p11_n = panel_dim; \ + inc_t rs_c11 = 2*rs_c; \ + inc_t cs_c11 = 2*cs_c; \ + dim_t j = diagoffc_abs; \ + ctype* c11 = ( ctype* )c + (j )*ldc; \ + ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \ + ctype_r* c11_r = ( ctype_r* )c11; \ + ctype_r* c11_i = ( ctype_r* )c11 + 1; \ + ctype_r* p11_r = ( ctype_r* )p11; \ + ctype_r* p11_i = ( ctype_r* )p11 + is_p; \ + ctype_r* alpha_r = one_r; \ + ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \ + ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \ + ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \ +\ + /* Compute the row and column strides of p11. */ \ + if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \ + else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \ +\ + /* Copy the real part of the stored triangle of c11 to p11_r. */ \ + PASTEMAC(chr,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + BLIS_NO_TRANSPOSE, \ + p11_m, \ + p11_n, \ + alpha_r, \ + c11_r, rs_c11, cs_c11, \ + p11_r, rs_p11, cs_p11 ); \ +\ + /* Copy the imaginary part of the stored triangle of c11 to p11_i, + scaling by -1 if conjugation on c was requested. */ \ + PASTEMAC(chr,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + BLIS_NO_TRANSPOSE, \ + p11_m, \ + p11_n, \ + alpha_i, \ + c11_i, rs_c11, cs_c11, \ + p11_i, rs_p11, cs_p11 ); \ +\ + /* If source matrix c is Hermitian, we have to zero out the + imaginary components of the diagonal of p11 in case the + corresponding elements in c11 were not already zero. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + for ( i = 0; i < p11_m; ++i ) \ + { \ + ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \ +\ + PASTEMAC(chr,set0s)( *pi11_i ); \ + } \ + } \ +\ + /* Apply kappa to the part of p11 that corresponds to the stored + part of c11 that was copied above. 
*/ \ + if ( bli_is_upper( uploc ) ) \ + { \ + PASTEMAC(ch,scalris_mxn_u)( 0, \ + p11_m, \ + p11_n, \ + &kappa_r, \ + &kappa_i, \ + p11_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ + else \ + { \ + PASTEMAC(ch,scalris_mxn_l)( 0, \ + p11_m, \ + p11_n, \ + &kappa_r, \ + &kappa_i, \ + p11_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ +/* + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \ + p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \ + p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_4m, packm_cxk_4m ) + + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, inc_t ldp \ + ) \ +{ \ + bool_t row_stored; \ + bool_t col_stored; \ +\ +\ + /* Create flags to incidate row or column storage. Since we don't + have the schema in scope, we must use the dimensions and strides + of the micro-panel to determine whether it is row- or column- + stored. */ \ + row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \ + col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \ +\ +\ + /* Pack the panel. 
*/ \ + PASTEMAC(ch,kername)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, is_p, ldp ); \ +\ +\ + /* Tweak the panel according to its triangular structure */ \ + { \ + dim_t j = bli_abs( diagoffp ); \ + ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \ + ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \ + inc_t cs_p11; \ + inc_t rs_p11; \ +\ + /* Compute the row and column strides of p11. */ \ + if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \ + else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \ +\ +\ + /* If the diagonal of c is implicitly unit, explicitly set the + the diagonal of the packed panel to kappa. */ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \ + ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \ +\ + PASTEMAC(chr,setd)( 0, \ + m_panel, \ + n_panel, \ + &kappa_r, \ + p11_r, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setd)( 0, \ + m_panel, \ + n_panel, \ + &kappa_i, \ + p11_i, rs_p11, cs_p11 ); \ + } \ +\ +\ + /* If requested, invert the diagonal of the packed panel. */ \ + if ( invdiag == TRUE ) \ + { \ + dim_t i; \ +\ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ +\ + PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \ + } \ + } \ +\ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). 
Note that this zero-filling is not needed for + trsm, since the unstored region is not referenced by the trsm + micro-kernel; however, zero-filling is needed for trmm, which + uses the gemm micro-kernel.*/ \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + uplo_t uplop11 = uploc; \ + doff_t diagoffp11 = 0; \ +\ + bli_toggle_uplo( uplop11 ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \ +\ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_r, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_4m, packm_cxk_4m ) + diff --git a/frame/1m/packm/bli_packm_gen_cxk.h b/frame/1m/packm/bli_packm_struc_cxk_4m.h similarity index 64% rename from frame/1m/packm/bli_packm_gen_cxk.h rename to frame/1m/packm/bli_packm_struc_cxk_4m.h index 7ffef14fa..4bffca572 100644 --- a/frame/1m/packm/bli_packm_gen_cxk.h +++ b/frame/1m/packm/bli_packm_struc_cxk_4m.h @@ -32,14 +32,16 @@ */ -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ struc_t strucc, \ - doff_t diagoffc, \ + doff_t diagoffp, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ + bool_t invdiag, \ dim_t m_panel, \ dim_t n_panel, \ dim_t m_panel_max, \ @@ -49,7 +51,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict p, inc_t rs_p, inc_t cs_p \ ); -INSERT_GENTPROT_BASIC( packm_gen_cxk ) +INSERT_GENTPROTCO_BASIC( packm_struc_cxk_4m ) @@ -65,11 +67,41 @@ void PASTEMAC(ch,varname)( \ dim_t n_panel, \ dim_t m_panel_max, \ dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, 
inc_t cs_p, \ + inc_t is_p, inc_t ldp \ ); -INSERT_GENTPROTCO_BASIC( packm_gen_cxk_4m ) +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_4m ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_4m ) -INSERT_GENTPROTCO_BASIC( packm_gen_cxk_3m ) diff --git a/frame/1m/packm/bli_packm_tri_cxk.c b/frame/1m/packm/bli_packm_tri_cxk.c deleted file mode 100644 index 1c2236f60..000000000 --- a/frame/1m/packm/bli_packm_tri_cxk.c +++ /dev/null @@ -1,720 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ - ) \ -{ \ - ctype* restrict zero = PASTEMAC(ch,0); \ -\ - dim_t panel_dim; \ - dim_t panel_len; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* If the strides of p indicate row storage, then we are packing to - column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_len = m_panel; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. 
*/ \ - panel_dim = m_panel; \ - panel_len = n_panel; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ - /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, ldp ); \ -\ -\ - /* If the diagonal of c is implicitly unit, explicitly set the - the diagonal of the packed panel to kappa. */ \ - if ( bli_is_unit_diag( diagc ) ) \ - { \ - PASTEMAC(ch,setd)( diagoffp, \ - m_panel, \ - n_panel, \ - kappa, \ - p, rs_p, cs_p ); \ - } \ -\ - /* If requested, invert the diagonal of the packed panel. */ \ - if ( invdiag == TRUE ) \ - { \ - PASTEMAC(ch,invertd)( diagoffp, \ - m_panel, \ - n_panel, \ - p, rs_p, cs_p ); \ - } \ -\ - /* Set the region opposite the diagonal of p to zero. To do this, - we need to reference the "unstored" region on the other side of - the diagonal. This amounts to toggling uploc and then shifting - the diagonal offset to shrink the newly referenced region (by - one diagonal). Note that this zero-filling is not needed for - trsm, since the unstored region is not referenced by the trsm - micro-kernel; however, zero-filling is needed for trmm, which - uses the gemm micro-kernel.*/ \ - { \ - uplo_t uplop = uploc; \ -\ - bli_toggle_uplo( uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \ -\ - PASTEMAC(ch,setm)( diagoffp, \ - BLIS_NONUNIT_DIAG, \ - uplop, \ - m_panel, \ - n_panel, \ - zero, \ - p, rs_p, cs_p ); \ - } \ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This allows the - micro-kernel to remain simple since it does not need to support - different register blockings for the edge cases. 
*/ \ - if ( m_panel != m_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype* p_edge = p + (i )*rs_p; \ -\ - PASTEMAC(ch,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype* p_edge = p + (j )*cs_p; \ -\ - PASTEMAC(ch,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ - } \ -\ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. 
*/ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t j = n_panel; \ - dim_t m_br = m_panel_max - i; \ - dim_t n_br = n_panel_max - j; \ - ctype* one = PASTEMAC(ch,1); \ - ctype* p_br = p + (i )*rs_p + (j )*cs_p; \ -\ - PASTEMAC(ch,setd)( 0, \ - m_br, \ - n_br, \ - one, \ - p_br, rs_p, cs_p ); \ - } \ -/* - PASTEMAC(ch,fprintm)( stdout, "packm_var1: setting br unit diag", m_br, n_br, \ - p_edge, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ -/* - if ( rs_p == 1 ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", m_panel_max, n_panel_max, \ - p, rs_p, cs_p, "%4.1f", "" ); \ - if ( cs_p == 1 ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", m_panel_max, n_panel_max, \ - p, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNC_BASIC0( packm_tri_cxk ) - - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ - ) \ -{ \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ -\ - dim_t i; \ - dim_t panel_dim; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t psp, ldp; \ -\ - inc_t rs_p11, cs_p11; \ -\ -\ - /* If the strides of p indicate row storage, then we are packing to - column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ - { \ - /* Prepare to pack to row-stored column panel. 
*/ \ - panel_dim = n_panel; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - rs_p11 = rs_p; \ - cs_p11 = 1; \ - } \ - else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - rs_p11 = 1; \ - cs_p11 = cs_p; \ - } \ -\ - /* Compute the panel stride (ie: the element offset to the imaginary - panel). */ \ - psp = ldp * panel_len_max; \ -\ -\ - /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk_4m)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, psp, ldp ); \ -\ -\ - /* Tweak the panel according to its triangular structure */ \ - { \ - dim_t j = bli_abs( diagoffp ); \ - ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \ - ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \ -\ - /* If the diagonal of c is implicitly unit, explicitly set the - the diagonal of the packed panel to kappa. */ \ - if ( bli_is_unit_diag( diagc ) ) \ - { \ - ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \ - ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \ -\ - PASTEMAC(chr,setd)( 0, \ - m_panel, \ - n_panel, \ - &kappa_r, \ - p11_r, rs_p11, cs_p11 ); \ - PASTEMAC(chr,setd)( 0, \ - m_panel, \ - n_panel, \ - &kappa_i, \ - p11_i, rs_p11, cs_p11 ); \ - } \ -\ - /* If requested, invert the diagonal of the packed panel. */ \ - if ( invdiag == TRUE ) \ - { \ - for ( i = 0; i < panel_dim; ++i ) \ - { \ - ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ - ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ -\ - PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \ - } \ - } \ -\ - /* Set the region opposite the diagonal of p to zero. To do this, - we need to reference the "unstored" region on the other side of - the diagonal. 
This amounts to toggling uploc and then shifting - the diagonal offset to shrink the newly referenced region (by - one diagonal). Note that this zero-filling is not needed for - trsm, since the unstored region is not referenced by the trsm - micro-kernel; however, zero-filling is needed for trmm, which - uses the gemm micro-kernel.*/ \ - { \ - uplo_t uplop11 = uploc; \ - doff_t diagoffp11 = 0; \ -\ - bli_toggle_uplo( uplop11 ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \ -\ - PASTEMAC(chr,setm)( diagoffp11, \ - BLIS_NONUNIT_DIAG, \ - uplop11, \ - panel_dim, \ - panel_dim, \ - zero_r, \ - p11_r, rs_p11, cs_p11 ); \ - PASTEMAC(chr,setm)( diagoffp11, \ - BLIS_NONUNIT_DIAG, \ - uplop11, \ - panel_dim, \ - panel_dim, \ - zero_r, \ - p11_i, rs_p11, cs_p11 ); \ - } \ - } \ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This allows the - micro-kernel to remain simple since it does not need to support - different register blockings for the edge cases. 
*/ \ - if ( m_panel != m_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ -\ - } \ -\ -\ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. 
*/ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t j = n_panel; \ - dim_t m_br = m_panel_max - i; \ - dim_t n_br = n_panel_max - j; \ - ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \ - ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \ -\ - PASTEMAC(chr,setd)( 0, \ - m_br, \ - n_br, \ - one_r, \ - p_br_r, rs_p, cs_p ); \ - PASTEMAC(chr,setd)( 0, \ - m_br, \ - n_br, \ - zero_r, \ - p_br_i, rs_p, cs_p ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_4m ) - - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffp, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p \ - ) \ -{ \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ -\ - dim_t i; \ - dim_t panel_dim; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t psp, ldp; \ -\ - inc_t rs_p11, cs_p11; \ -\ -\ - /* If the strides of p indicate row storage, then we are packing to - column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - rs_p11 = rs_p; \ - cs_p11 = 1; \ - } \ - else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. 
*/ \ - panel_dim = m_panel; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - rs_p11 = 1; \ - cs_p11 = cs_p; \ - } \ -\ - /* Compute the panel stride (ie: the element offset to the imaginary - panel). */ \ - psp = ldp * panel_len_max; \ -\ -\ - /* Pack the panel. */ \ - PASTEMAC(ch,packm_cxk_3m)( conjc, \ - panel_dim, \ - panel_len, \ - kappa, \ - c, incc, ldc, \ - p, psp, ldp ); \ -\ -\ - /* Tweak the panel according to its triangular structure */ \ - { \ - dim_t j = bli_abs( diagoffp ); \ - ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \ - ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \ - ctype_r* p11_rpi = ( ctype_r* )p + 2*psp + (j )*ldp; \ -\ - /* If the diagonal of c is implicitly unit, explicitly set the - the diagonal of the packed panel to kappa. */ \ - if ( bli_is_unit_diag( diagc ) ) \ - { \ - ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \ - ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \ -\ - PASTEMAC(chr,setd)( 0, \ - m_panel, \ - n_panel, \ - &kappa_r, \ - p11_r, rs_p11, cs_p11 ); \ - PASTEMAC(chr,setd)( 0, \ - m_panel, \ - n_panel, \ - &kappa_i, \ - p11_i, rs_p11, cs_p11 ); \ - PASTEMAC(chr,setd)( 0, \ - m_panel, \ - n_panel, \ - &kappa_r, \ - p11_rpi, rs_p11, cs_p11 ); \ - } \ -\ - /* If requested, invert the diagonal of the packed panel. Note - that we do not need to update the ri panel since inverted - diagonals are only needed by trsm, which does not use the - p11 section of the ri panel. */ \ - if ( invdiag == TRUE ) \ - { \ - for ( i = 0; i < panel_dim; ++i ) \ - { \ - ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ - ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ -\ - PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \ - } \ - } \ -\ - /* Set the region opposite the diagonal of p to zero. To do this, - we need to reference the "unstored" region on the other side of - the diagonal. 
This amounts to toggling uploc and then shifting - the diagonal offset to shrink the newly referenced region (by - one diagonal). Note that this zero-filling is not needed for - trsm, since the unstored region is not referenced by the trsm - micro-kernel; however, zero-filling is needed for trmm, which - uses the gemm micro-kernel.*/ \ - { \ - uplo_t uplop11 = uploc; \ - doff_t diagoffp11 = 0; \ -\ - bli_toggle_uplo( uplop11 ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \ -\ - PASTEMAC(chr,setm)( diagoffp11, \ - BLIS_NONUNIT_DIAG, \ - uplop11, \ - panel_dim, \ - panel_dim, \ - zero_r, \ - p11_r, rs_p11, cs_p11 ); \ - PASTEMAC(chr,setm)( diagoffp11, \ - BLIS_NONUNIT_DIAG, \ - uplop11, \ - panel_dim, \ - panel_dim, \ - zero_r, \ - p11_i, rs_p11, cs_p11 ); \ - PASTEMAC(chr,setm)( diagoffp11, \ - BLIS_NONUNIT_DIAG, \ - uplop11, \ - panel_dim, \ - panel_dim, \ - zero_r, \ - p11_rpi, rs_p11, cs_p11 ); \ - } \ - } \ -\ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - either region exists, we set them to zero. This allows the - micro-kernel to remain simple since it does not need to support - different register blockings for the edge cases. 
*/ \ - if ( m_panel != m_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ - ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (i )*rs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_rpi, rs_p, cs_p ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ - ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ - ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (j )*cs_p; \ -\ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_r, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_i, rs_p, cs_p ); \ - PASTEMAC(chr,setm)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero_r, \ - p_edge_rpi, rs_p, cs_p ); \ - } \ -\ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. 
*/ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - dim_t i = m_panel; \ - dim_t j = n_panel; \ - dim_t m_br = m_panel_max - i; \ - dim_t n_br = n_panel_max - j; \ - ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \ - ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \ -\ - PASTEMAC(chr,setd)( 0, \ - m_br, \ - n_br, \ - one_r, \ - p_br_r, rs_p, cs_p ); \ - PASTEMAC(chr,setd)( 0, \ - m_br, \ - n_br, \ - zero_r, \ - p_br_i, rs_p, cs_p ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_3m ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.c b/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.c index 31ddc5c24..565352ab8 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.c @@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -54,8 +54,8 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -133,7 +133,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -145,8 +145,8 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_i = 
( ctype_r* )p + is_p; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -232,7 +232,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -244,8 +244,8 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -339,7 +339,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -351,8 +351,8 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -454,7 +454,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -466,8 +466,8 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_i 
= ( ctype_r* )p + is_p; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -577,7 +577,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -589,8 +589,8 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -708,7 +708,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -720,8 +720,8 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -847,7 +847,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -859,8 +859,8 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \ + ctype_r* restrict 
pi1_i = ( ctype_r* )p + is_p; \ + ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.h b/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.h index 2912570ad..8cc3d04d5 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_3m.h @@ -40,7 +40,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ); INSERT_GENTPROT_BASIC( packm_ref_2xk_3m ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.c b/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.c index 817c54c07..857709584 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.c @@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -54,7 +54,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -128,7 +128,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -140,7 +140,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -222,7 +222,7 @@ void 
PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -234,7 +234,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -324,7 +324,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -336,7 +336,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -434,7 +434,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -446,7 +446,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -552,7 +552,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -564,7 +564,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a 
+ 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -678,7 +678,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -690,7 +690,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ @@ -812,7 +812,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ) \ { \ const inc_t inca2 = 2 * inca; \ @@ -824,7 +824,7 @@ void PASTEMAC(ch,varname)( \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.h b/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.h index dc67abbaf..fb3da450a 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_4m.h @@ -40,7 +40,7 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ void* kappa, \ void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ + void* p, inc_t is_p, inc_t ldp \ ); INSERT_GENTPROT_BASIC( packm_ref_2xk_4m ) diff --git a/frame/3/gemm/3m/bli_gemm3m_cntl.c b/frame/3/gemm/3m/bli_gemm3m_cntl.c index 1f12f323f..3e42a3d34 100644 --- a/frame/3/gemm/3m/bli_gemm3m_cntl.c +++ b/frame/3/gemm/3m/bli_gemm3m_cntl.c @@ -119,7 +119,7 @@ 
void bli_gemm3m_cntl_init() gemm3m_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, gemm3m_mr, gemm3m_kr, TRUE, // densify; used by hemm/symm @@ -132,7 +132,7 @@ void bli_gemm3m_cntl_init() gemm3m_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, gemm3m_kr, gemm3m_nr, TRUE, // densify; used by hemm/symm diff --git a/frame/3/gemm/4m/bli_gemm4m_cntl.c b/frame/3/gemm/4m/bli_gemm4m_cntl.c index 3f83cedd3..262660cd2 100644 --- a/frame/3/gemm/4m/bli_gemm4m_cntl.c +++ b/frame/3/gemm/4m/bli_gemm4m_cntl.c @@ -119,7 +119,7 @@ void bli_gemm4m_cntl_init() gemm4m_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, gemm4m_mr, gemm4m_kr, TRUE, // densify; used by hemm/symm @@ -132,7 +132,7 @@ void bli_gemm4m_cntl_init() gemm4m_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, gemm4m_kr, gemm4m_nr, TRUE, // densify; used by hemm/symm diff --git a/frame/3/herk/3m/bli_herk3m_cntl.c b/frame/3/herk/3m/bli_herk3m_cntl.c index c313d907d..a3ee24341 100644 --- a/frame/3/herk/3m/bli_herk3m_cntl.c +++ b/frame/3/herk/3m/bli_herk3m_cntl.c @@ -62,7 +62,7 @@ void bli_herk3m_cntl_init() herk3m_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, gemm3m_mr, gemm3m_kr, FALSE, // already dense; densify not necessary @@ -75,7 +75,7 @@ void bli_herk3m_cntl_init() herk3m_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, gemm3m_kr, gemm3m_nr, FALSE, // already dense; densify not necessary diff --git a/frame/3/herk/4m/bli_herk4m_cntl.c b/frame/3/herk/4m/bli_herk4m_cntl.c index 61b029b33..ad4cad40f 100644 --- a/frame/3/herk/4m/bli_herk4m_cntl.c +++ b/frame/3/herk/4m/bli_herk4m_cntl.c @@ -62,7 +62,7 @@ void bli_herk4m_cntl_init() herk4m_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, gemm4m_mr, gemm4m_kr, FALSE, // already dense; densify not 
necessary @@ -75,7 +75,7 @@ void bli_herk4m_cntl_init() herk4m_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, gemm4m_kr, gemm4m_nr, FALSE, // already dense; densify not necessary diff --git a/frame/3/trmm/3m/bli_trmm3m_cntl.c b/frame/3/trmm/3m/bli_trmm3m_cntl.c index 953ec75d3..b24460e8d 100644 --- a/frame/3/trmm/3m/bli_trmm3m_cntl.c +++ b/frame/3/trmm/3m/bli_trmm3m_cntl.c @@ -73,7 +73,7 @@ void bli_trmm3m_cntl_init() trmm3m_l_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, // IMPORTANT: for consistency with trsm, "k" dim // multiple is set to mr. gemm3m_mr, @@ -88,7 +88,7 @@ void bli_trmm3m_cntl_init() trmm3m_l_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, // IMPORTANT: m dim multiple here must be mr // since "k" dim multiple is set to mr above. gemm3m_mr, @@ -104,7 +104,7 @@ void bli_trmm3m_cntl_init() trmm3m_r_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, // IMPORTANT: for consistency with trsm, "k" dim // multiple is set to nr. gemm3m_mr, @@ -119,7 +119,7 @@ void bli_trmm3m_cntl_init() trmm3m_r_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, // IMPORTANT: m dim multiple here must be nr // since "k" dim multiple is set to nr above. gemm3m_nr, diff --git a/frame/3/trmm/4m/bli_trmm4m_cntl.c b/frame/3/trmm/4m/bli_trmm4m_cntl.c index f7d30b31c..0876f2f62 100644 --- a/frame/3/trmm/4m/bli_trmm4m_cntl.c +++ b/frame/3/trmm/4m/bli_trmm4m_cntl.c @@ -73,7 +73,7 @@ void bli_trmm4m_cntl_init() trmm4m_l_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, // IMPORTANT: for consistency with trsm, "k" dim // multiple is set to mr. 
gemm4m_mr, @@ -88,7 +88,7 @@ void bli_trmm4m_cntl_init() trmm4m_l_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, // IMPORTANT: m dim multiple here must be mr // since "k" dim multiple is set to mr above. gemm4m_mr, @@ -104,7 +104,7 @@ void bli_trmm4m_cntl_init() trmm4m_r_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, // IMPORTANT: for consistency with trsm, "k" dim // multiple is set to nr. gemm4m_mr, @@ -119,7 +119,7 @@ void bli_trmm4m_cntl_init() trmm4m_r_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, // IMPORTANT: m dim multiple here must be nr // since "k" dim multiple is set to nr above. gemm4m_nr, diff --git a/frame/3/trsm/3m/bli_trsm3m_cntl.c b/frame/3/trsm/3m/bli_trsm3m_cntl.c index d40c7b44c..71d05880c 100644 --- a/frame/3/trsm/3m/bli_trsm3m_cntl.c +++ b/frame/3/trsm/3m/bli_trsm3m_cntl.c @@ -92,7 +92,7 @@ void bli_trsm3m_cntl_init() trsm3m_l_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, // IMPORTANT: n dim multiple must be mr to // support right and bottom-right edge cases gemm3m_mr, @@ -107,7 +107,7 @@ void bli_trsm3m_cntl_init() trsm3m_l_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, // IMPORTANT: m dim multiple must be mr since // B_pack is updated (ie: serves as C) in trsm gemm3m_mr, @@ -123,7 +123,7 @@ void bli_trsm3m_cntl_init() trsm3m_r_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, gemm3m_nr, gemm3m_mr, FALSE, // already dense; densify not necessary @@ -136,7 +136,7 @@ void bli_trsm3m_cntl_init() trsm3m_r_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, + BLIS_VARIANT2, gemm3m_mr, gemm3m_mr, TRUE, // densify diff --git a/frame/3/trsm/4m/bli_trsm4m_cntl.c b/frame/3/trsm/4m/bli_trsm4m_cntl.c index 9fb9e9251..f92204539 100644 --- a/frame/3/trsm/4m/bli_trsm4m_cntl.c +++ 
b/frame/3/trsm/4m/bli_trsm4m_cntl.c @@ -93,7 +93,7 @@ void bli_trsm4m_cntl_init() trsm4m_l_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, // IMPORTANT: n dim multiple must be mr to // support right and bottom-right edge cases gemm4m_mr, @@ -108,7 +108,7 @@ void bli_trsm4m_cntl_init() trsm4m_l_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, // IMPORTANT: m dim multiple must be mr since // B_pack is updated (ie: serves as C) in trsm gemm4m_mr, @@ -124,7 +124,7 @@ void bli_trsm4m_cntl_init() trsm4m_r_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, gemm4m_nr, gemm4m_mr, FALSE, // already dense; densify not necessary @@ -137,7 +137,7 @@ void bli_trsm4m_cntl_init() trsm4m_r_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, + BLIS_VARIANT2, gemm4m_mr, gemm4m_mr, TRUE, // densify diff --git a/frame/include/bli_kernel_type_defs.h b/frame/include/bli_kernel_type_defs.h index 2fdf222cc..ae9a72e1a 100644 --- a/frame/include/bli_kernel_type_defs.h +++ b/frame/include/bli_kernel_type_defs.h @@ -101,5 +101,31 @@ typedef void \ INSERT_GENTPROT_BASIC( gemmtrsm_ukr_t ) +// -- packm_struc_cxk kernel -- + +#undef GENTPROT +#define GENTPROT( ctype, ch, tname ) \ +\ +typedef void \ +(*PASTECH(ch,tname))( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ); + +INSERT_GENTPROT_BASIC( packm_ker_t ) + + + #endif