diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index fbab40f5b..6fecc035c 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -47,8 +47,10 @@ #include "bli_packm_struc_cxk.h" #include "bli_packm_struc_cxk_4m.h" #include "bli_packm_struc_cxk_3m.h" +#include "bli_packm_struc_cxk_rih.h" #include "bli_packm_cxk.h" #include "bli_packm_cxk_4m.h" #include "bli_packm_cxk_3m.h" +#include "bli_packm_cxk_rih.h" diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index 39c2e1179..5240c60c0 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -63,6 +63,7 @@ typedef void (*FUNCPTR_T)( extern func_t* packm_struc_cxk_kers; extern func_t* packm_struc_cxk_4m_kers; extern func_t* packm_struc_cxk_3m_kers; +extern func_t* packm_struc_cxk_rih_kers; void bli_packm_blk_var2( obj_t* c, @@ -153,6 +154,9 @@ void bli_packm_blk_var2( obj_t* c, // Choose the correct func_t object based on the pack_t schema. if ( bli_is_4m_packed( schema ) ) packm_kers = packm_struc_cxk_4m_kers; else if ( bli_is_3m_packed( schema ) ) packm_kers = packm_struc_cxk_3m_kers; + else if ( bli_is_ro_packed( schema ) || + bli_is_io_packed( schema ) || + bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; else packm_kers = packm_struc_cxk_kers; // Query the datatype-specific function pointer from the func_t object. 
diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index 37e29708d..0147cc2f1 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -40,6 +40,7 @@ blksz_t* packm_mult_nvec; func_t* packm_struc_cxk_kers; func_t* packm_struc_cxk_4m_kers; func_t* packm_struc_cxk_3m_kers; +func_t* packm_struc_cxk_rih_kers; packm_t* packm_cntl_row; packm_t* packm_cntl_col; @@ -74,6 +75,13 @@ void bli_packm_cntl_init() bli_cpackm_struc_cxk_3m, FALSE, bli_zpackm_struc_cxk_3m, FALSE ); + packm_struc_cxk_rih_kers + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + bli_cpackm_struc_cxk_rih, FALSE, + bli_zpackm_struc_cxk_rih, FALSE ); + // Create blocksize objects for m and n register blocking. We will attach // these to the packm control node so they can be used to (a) allocate a @@ -146,6 +154,7 @@ void bli_packm_cntl_finalize() bli_func_obj_free( packm_struc_cxk_kers ); bli_func_obj_free( packm_struc_cxk_4m_kers ); bli_func_obj_free( packm_struc_cxk_3m_kers ); + bli_func_obj_free( packm_struc_cxk_rih_kers ); bli_cntl_obj_free( packm_cntl_row ); bli_cntl_obj_free( packm_cntl_col ); diff --git a/frame/1m/packm/bli_packm_cxk_rih.c b/frame/1m/packm/bli_packm_cxk_rih.c new file mode 100644 index 000000000..21b711e64 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_rih.c @@ -0,0 +1,290 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T packm_cxk_fp + +typedef void (*FUNCPTR_T)( + conj_t conja, + pack_t schema, + dim_t panel_len, + void* kappa, + void* a, inc_t inca, inc_t lda, + void* p, inc_t ldp + ); + +#undef FUNCPTR_ARRAY_LENGTH +#define FUNCPTR_ARRAY_LENGTH 18 + +static FUNCPTR_T ftypes_rih[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* micro-panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 2 */ + { + NULL, BLIS_CPACKM_2XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_2XK_RIH_KERNEL, + }, + /* micro-panel width = 3 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 4 */ + { + NULL, BLIS_CPACKM_4XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_4XK_RIH_KERNEL, + }, + /* micro-panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 6 */ + { + NULL, BLIS_CPACKM_6XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_6XK_RIH_KERNEL, + }, + /* micro-panel width = 7 */ + { + NULL, NULL, NULL, 
NULL, + }, + /* micro-panel width = 8 */ + { + NULL, BLIS_CPACKM_8XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_8XK_RIH_KERNEL, + }, + /* micro-panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 10 */ + { + NULL, BLIS_CPACKM_10XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_10XK_RIH_KERNEL, + }, + /* micro-panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 12 */ + { + NULL, BLIS_CPACKM_12XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_12XK_RIH_KERNEL, + }, + /* micro-panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 14 */ + { + NULL, BLIS_CPACKM_14XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_14XK_RIH_KERNEL, + }, + /* micro-panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 16 */ + { + NULL, BLIS_CPACKM_16XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_16XK_RIH_KERNEL, + }, + /* micro-panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, +}; + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + num_t dt; \ + FUNCPTR_T f; \ +\ + /* Acquire the datatype for the current function. */ \ + dt = PASTEMAC(ch,type); \ +\ + /* Index into the array to extract the correct function pointer. + If the micro-panel dimension is too big to be within the array of + explicitly handled kernels, then we treat that kernel the same + as if it were in range but unimplemented. */ \ + if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) f = ftypes_rih[panel_dim][dt]; \ + else f = NULL; \ +\ + /* If there exists a kernel implementation for the micro-panel dimension + provided, we invoke the implementation. Otherwise, we use scal2m. 
*/ \ + if ( f != NULL ) \ + { \ + f( conja, \ + schema, \ + panel_len, \ + kappa, \ + a, inca, lda, \ + p, ldp ); \ + } \ + else \ + { \ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict a_r = ( ctype* )a; \ + ctype_r* restrict p_r = ( ctype_r* )p; \ + const dim_t inca1 = inca; \ + const dim_t lda1 = lda; \ + const dim_t ldp1 = ldp; \ + dim_t i, j; \ +\ + /* Treat the micro-panel as panel_dim x panel_len and column-stored + (unit row stride). */ \ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2jros)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2ros)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2jios)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2ios)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if 
( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2rpis)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_cxk_rih ) + diff --git a/frame/1m/packm/bli_packm_cxk_rih.h b/frame/1m/packm/bli_packm_cxk_rih.h new file mode 100644 index 000000000..5106b7b03 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_rih.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_packm_ref_cxk_rih.h" + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_cxk_rih ) + diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 1565b8921..876920691 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -332,6 +332,10 @@ void bli_packm_init_pack( invdiag_t invert_diag, if ( bli_is_3m_packed( pack_schema ) ) ps_p = ( ps_p * 3 ) / 2; + else if ( bli_is_ro_packed( pack_schema ) || + bli_is_io_packed( pack_schema ) || + bli_is_rpi_packed( pack_schema ) ) + ps_p = ps_p / 2; // Store the strides and panel dimension in p. 
bli_obj_set_incs( rs_p, cs_p, *p ); @@ -373,6 +377,10 @@ void bli_packm_init_pack( invdiag_t invert_diag, if ( bli_is_3m_packed( pack_schema ) ) ps_p = ( ps_p * 3 ) / 2; + else if ( bli_is_ro_packed( pack_schema ) || + bli_is_io_packed( pack_schema ) || + bli_is_rpi_packed( pack_schema ) ) + ps_p = ps_p / 2; // Store the strides and panel dimension in p. bli_obj_set_incs( rs_p, cs_p, *p ); diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index fcdfd943f..37aef3d6b 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -389,41 +389,46 @@ void PASTEMAC(ch,varname)( \ \ /* Pack the stored triangle of c11 to p11. */ \ { \ - ctype* restrict c11; \ - ctype* restrict p11; \ - dim_t p11_m; \ - dim_t p11_n; \ + dim_t p11_m = panel_dim; \ + dim_t p11_n = panel_dim; \ + dim_t j = diagoffc_abs; \ + ctype* restrict c11 = c + (j )*ldc; \ + ctype* restrict p11 = p + (j )*ldp; \ \ - p11_m = panel_dim; \ - p11_n = panel_dim; \ - j = diagoffc_abs; \ - p11 = p + (j )*ldp; \ - c11 = c + (j )*ldc; \ -\ - PASTEMAC(ch,scal2m)( 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - conjc, \ - p11_m, \ - p11_n, \ - kappa, \ - c11, rs_c, cs_c, \ - p11, rs_p, cs_p ); \ + PASTEMAC(ch,copym)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + conjc, \ + p11_m, \ + p11_n, \ + c11, rs_c, cs_c, \ + p11, rs_p, cs_p ); \ \ /* If source matrix c is Hermitian, we have to zero out the imaginary components of the diagonal of p11 in case the corresponding elements in c11 were not already zero. */ \ if ( bli_is_hermitian( strucc ) ) \ { \ - /* NOTE: We can directly increment p11 since we are done - using p11 for the remainder of the function. 
*/ \ + ctype* restrict pi11 = p11; \ +\ for ( i = 0; i < p11_m; ++i ) \ { \ - PASTEMAC(ch,seti0s)( *p11 ); \ + PASTEMAC(ch,seti0s)( *pi11 ); \ \ - p11 += rs_p + cs_p; \ + pi11 += rs_p + cs_p; \ } \ } \ +\ + /* Now that the diagonal has been made explicitly Hermitian + (if applicable), we can now safely scale the stored + triangle specified by uploc. */ \ + PASTEMAC(ch,scalm)( BLIS_NO_CONJUGATE, \ + 0, \ + uploc, \ + p11_m, \ + p11_n, \ + kappa, \ + p11, rs_p, cs_p ); \ } \ } \ } diff --git a/frame/1m/packm/bli_packm_struc_cxk_rih.c b/frame/1m/packm/bli_packm_struc_cxk_rih.c new file mode 100644 index 000000000..2a15e7cc1 --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_rih.c @@ -0,0 +1,532 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ +\ + /* Determine the dimensions and relative strides of the micro-panel + based on its pack schema. */ \ + if ( bli_is_col_packed( schema ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_row_packed( schema ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ +\ + /* Handle micro-panel packing based on the structure of the matrix + being packed. */ \ + if ( bli_is_general( strucc ) ) \ + { \ + /* For micro-panels of general matrices, we can call the pack + kernel front-end directly. 
*/ \ + PASTEMAC(ch,kername)( conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp ); \ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + /* Call a helper function for micro-panels of Hermitian/symmetric + matrices. */ \ + PASTEMAC(ch,packm_herm_cxk_rih)( strucc, \ + diagoffc, \ + uploc, \ + conjc, \ + schema, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp ); \ + } \ + else /* ( bli_is_triangular( strucc ) ) */ \ + { \ + /* Call a helper function for micro-panels of triangular + matrices. */ \ + PASTEMAC(ch,packm_tri_cxk_rih)( strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp ); \ + } \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimensions are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. 
*/ \ + if ( m_panel != m_panel_max ) \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + } \ +\ +\ + if ( bli_is_triangular( strucc ) ) \ + { \ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting multiplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. */ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + /* We don't need this case if we aren't supporting trsm. + Why? Because trmm's packm control tree node should be + using k dimension multiples of 1 (kr == 1), which means + there will never be zero padding at the far end of a + micro-panel. 
*/ \ + } \ + } \ +\ +\ +/* + { \ + if ( bli_is_col_packed( schema ) ) \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: bp copied", m_panel_max, n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + else if ( bli_is_row_packed( schema ) ) \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: ap copied", m_panel_max, n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ + \ +\ +} + +INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_rih, packm_cxk_rih ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + ) \ +{ \ + bool_t row_stored; \ + bool_t col_stored; \ + doff_t diagoffc_abs; \ + dim_t j; \ +\ +\ + /* Create flags to indicate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + row_stored = bli_is_col_packed( schema ); \ + col_stored = bli_is_row_packed( schema ); \ +\ +\ + /* Handle the case where the micro-panel does NOT intersect the + diagonal separately from the case where it does intersect. */ \ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) 
*/ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,kername)( conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + ctype_r* restrict p_r = ( ctype_r* )p; \ +\ + ctype* restrict c10; \ + ctype_r* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype_r* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + if ( ( col_stored && diagoffc < 0 ) || \ + ( row_stored && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( row_stored && bli_is_upper( uploc ) ) || \ + ( col_stored && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p_r; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ + ( col_stored && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p_r; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc10, \ + schema, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, ldp ); \ +\ + /* Pack to p12. For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc12, \ + schema, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp ); \ +\ + /* Pack the stored triangle of c11 to p11. 
*/ \ + { \ + dim_t j = diagoffc_abs; \ + ctype_r* restrict p_r = ( ctype_r* )p; \ + ctype* restrict c11 = c + (j )*ldc; \ + ctype_r* restrict p11_r = p_r + (j )*ldp; \ +\ + PASTEMAC(ch,scal2rihs_mxn_uplo)( schema, \ + uploc, \ + conjc, \ + panel_dim, \ + kappa, \ + c11, rs_c, cs_c, \ + p11_r, rs_p, cs_p ); \ +\ + /* If we are packing a micro-panel with Hermitian structure, + we must take special care of the diagonal. Now, if kappa + were guaranteed to be unit, all we would need to do is + explicitly zero out the imaginary part of the diagonal of + p11, in case the diagonal of the source matrix contained + garbage (non-zero) imaginary values. HOWEVER, since kappa + can be non-unit, things become a little more complicated. + In general, we must re-apply the kappa scalar to ONLY the + real part of the diagonal of the source matrix and save + the result to the diagonal of p11. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + PASTEMAC3(ch,chr,ch,scal2rihs_mxn_diag)( schema, \ + panel_dim, \ + panel_dim, \ + kappa, \ + c11, rs_c, cs_c, \ + p11_r, rs_p, cs_p ); \ + } \ +\ +/* + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \ + p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \ + p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_rih, packm_cxk_rih ) + + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + 
 ) \ +{ \ + /* Pack the panel. */ \ + PASTEMAC(ch,kername)( conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp ); \ +\ +\ + /* Tweak the panel according to its triangular structure */ \ + { \ + ctype_r* p_r = ( ctype_r* )p; \ +\ + dim_t j = bli_abs( diagoffp ); \ + ctype_r* p11_r = p_r + (j )*ldp; \ +\ + /* If the diagonal of c is implicitly unit, explicitly set the + diagonal of the packed panel to kappa. */ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + PASTEMAC(ch,setrihs_mxn_diag)( schema, \ + panel_dim, \ + panel_dim, \ + kappa, \ + p11_r, rs_p, cs_p ); \ + } \ +\ +\ + /* If requested, invert the diagonal of the packed panel. */ \ + if ( invdiag == TRUE ) \ + { \ + /* We don't need this case if we aren't supporting trsm. */ \ + } \ +\ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). */ \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + uplo_t uplop = uploc; \ +\ + bli_toggle_uplo( uplop ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \ +\ + PASTEMAC(chr,setm)( diagoffp, \ + BLIS_NONUNIT_DIAG, \ + uplop, \ + m_panel, \ + n_panel, \ + zero_r, \ + p_r, rs_p, cs_p ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_rih, packm_cxk_rih ) + diff --git a/frame/1m/packm/bli_packm_struc_cxk_rih.h b/frame/1m/packm/bli_packm_struc_cxk_rih.h new file mode 100644 index 000000000..87e5dcead --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_rih.h @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ); + +INSERT_GENTPROTCO_BASIC( packm_struc_cxk_rih ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_rih ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_rih ) + diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.c b/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.c new file mode 100644 index 000000000..5b52d8adb --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.c @@ -0,0 +1,2082 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* This works regardless of conja since we are only copying + the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + 
PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( 
packm_ref_2xk_rih ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* This works regardless of conja since we are only copying + the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + 
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 
1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_rih ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* 
kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* This works regardless of conja since we are only copying + the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += 
ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r 
+ 2) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + 
PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_rih ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* This works regardless of conja since we are only copying + the real part. 
*/ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 7*inca2), *(pi1_r + 7) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 
7*inca), *(pi1_r + 7) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); 
\ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), -*(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), 
*(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + 
PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_rih ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* This works regardless of conja since we are only copying + the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 9*inca2), *(pi1_r + 9) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, 
*(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + 
PASTEMAC(chr,copys)( -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ +\ + alpha1 += 
lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), -*(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 8*inca2), -*(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 9*inca2), -*(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i 
+= lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ 
+ { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_rih ) + + +
+/*
+   packm_ref_12xk_rih
+
+   Packs a 12 x n micro-panel. On each of the n iterations, 12 elements
+   of a (element stride inca) are each reduced to a single ctype_r value
+   and stored in the 12 consecutive slots starting at p; a then advances
+   by lda and p by ldp. a is addressed both as ctype elements and as
+   interleaved ctype_r pairs (real part at offset 0, imaginary part at
+   offset 1).
+
+   schema selects what is stored:
+     - ro        : derived from the real part only;
+     - io        : derived from the imaginary part only;
+     - otherwise : (expected to be rpi) real and imaginary parts are
+                   combined through add3s / scal2rpis.
+
+   When the eq1 test on kappa succeeds the parts are read directly, with
+   the imaginary part negated if conja requests conjugation. Otherwise
+   kappa is applied through the scal2* scalar macros (defined elsewhere),
+   using their "j" variants when conjugation is requested.
+*/
+#undef GENTFUNCCO
+#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           conj_t conja, \
+                           pack_t schema, \
+                           dim_t n, \
+                           void* kappa, \
+                           void* a, inc_t inca, inc_t lda, \
+                           void* p, inc_t ldp \
+                         ) \
+{ \
+    /* Number of elements packed per iteration of the n loop. */ \
+    const dim_t mnr = 12; \
+\
+    /* Strides of a expressed in units of ctype_r rather than ctype. */ \
+    const inc_t inca_r = 2 * inca; \
+    const inc_t lda_r = 2 * lda; \
+\
+    ctype* kappa_c = kappa; \
+    ctype* restrict a_c = a; \
+    ctype_r* restrict a_re = ( ctype_r* )a; \
+    ctype_r* restrict a_im = ( ctype_r* )a + 1; \
+    ctype_r* restrict p_re = ( ctype_r* )p; \
+    dim_t i; \
+\
+    if ( bli_is_ro_packed( schema ) ) \
+    { \
+        if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+        { \
+            /* conja is not consulted here: only real parts are read. */ \
+            for ( ; n != 0; --n ) \
+            { \
+                for ( i = 0; i < mnr; ++i ) \
+                { \
+                    PASTEMAC(chr,copys)( a_re[ i*inca_r ], p_re[ i ] ); \
+                } \
+\
+                a_re += lda_r; \
+                p_re += ldp; \
+            } \
+        } \
+        else if ( bli_is_conj( conja ) ) \
+        { \
+            for ( ; n != 0; --n ) \
+            { \
+                for ( i = 0; i < mnr; ++i ) \
+                { \
+                    PASTEMAC(ch,scal2jros)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                } \
+\
+                a_c += lda; \
+                p_re += ldp; \
+            } \
+        } \
+        else \
+        { \
+            for ( ; n != 0; --n ) \
+            { \
+                for ( i = 0; i < mnr; ++i ) \
+                { \
+                    PASTEMAC(ch,scal2ros)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                } \
+\
+                a_c += lda; \
+                p_re += ldp; \
+            } \
+        } \
+    } \
+    else if ( bli_is_io_packed( schema ) ) \
+    { \
+        if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+        { \
+            if ( bli_is_conj( conja ) ) \
+            { \
+                /* Conjugation: store the negated imaginary part. */ \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(chr,copys)( -a_im[ i*inca_r ], p_re[ i ] ); \
+                    } \
+\
+                    a_im += lda_r; \
+                    p_re += ldp; \
+                } \
+            } \
+            else \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(chr,copys)( a_im[ i*inca_r ], p_re[ i ] ); \
+                    } \
+\
+                    a_im += lda_r; \
+                    p_re += ldp; \
+                } \
+            } \
+        } \
+        else \
+        { \
+            if ( bli_is_conj( conja ) ) \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(ch,scal2jios)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                    } \
+\
+                    a_c += lda; \
+                    p_re += ldp; \
+                } \
+            } \
+            else \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(ch,scal2ios)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                    } \
+\
+                    a_c += lda; \
+                    p_re += ldp; \
+                } \
+            } \
+        } \
+    } \
+    else /* if ( bli_is_rpi_packed( schema ) ) */ \
+    { \
+        if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+        { \
+            if ( bli_is_conj( conja ) ) \
+            { \
+                /* Conjugation: combine real part with negated imaginary part. */ \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(chr,add3s)( a_re[ i*inca_r ], -a_im[ i*inca_r ], p_re[ i ] ); \
+                    } \
+\
+                    a_re += lda_r; \
+                    a_im += lda_r; \
+                    p_re += ldp; \
+                } \
+            } \
+            else \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(chr,add3s)( a_re[ i*inca_r ], a_im[ i*inca_r ], p_re[ i ] ); \
+                    } \
+\
+                    a_re += lda_r; \
+                    a_im += lda_r; \
+                    p_re += ldp; \
+                } \
+            } \
+        } \
+        else \
+        { \
+            if ( bli_is_conj( conja ) ) \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(ch,scal2jrpis)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                    } \
+\
+                    a_c += lda; \
+                    p_re += ldp; \
+                } \
+            } \
+            else \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(ch,scal2rpis)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                    } \
+\
+                    a_c += lda; \
+                    p_re += ldp; \
+                } \
+            } \
+        } \
+    } \
+}
+
+INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_rih )
+
+
+
+/*
+   packm_ref_14xk_rih
+
+   Packs a 14 x n micro-panel. On each of the n iterations, 14 elements
+   of a (element stride inca) are each reduced to a single ctype_r value
+   and stored in the 14 consecutive slots starting at p; a then advances
+   by lda and p by ldp. a is addressed both as ctype elements and as
+   interleaved ctype_r pairs (real part at offset 0, imaginary part at
+   offset 1).
+
+   schema selects what is stored:
+     - ro        : derived from the real part only;
+     - io        : derived from the imaginary part only;
+     - otherwise : (expected to be rpi) real and imaginary parts are
+                   combined through add3s / scal2rpis.
+
+   When the eq1 test on kappa succeeds the parts are read directly, with
+   the imaginary part negated if conja requests conjugation. Otherwise
+   kappa is applied through the scal2* scalar macros (defined elsewhere),
+   using their "j" variants when conjugation is requested.
+*/
+#undef GENTFUNCCO
+#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           conj_t conja, \
+                           pack_t schema, \
+                           dim_t n, \
+                           void* kappa, \
+                           void* a, inc_t inca, inc_t lda, \
+                           void* p, inc_t ldp \
+                         ) \
+{ \
+    /* Number of elements packed per iteration of the n loop. */ \
+    const dim_t mnr = 14; \
+\
+    /* Strides of a expressed in units of ctype_r rather than ctype. */ \
+    const inc_t inca_r = 2 * inca; \
+    const inc_t lda_r = 2 * lda; \
+\
+    ctype* kappa_c = kappa; \
+    ctype* restrict a_c = a; \
+    ctype_r* restrict a_re = ( ctype_r* )a; \
+    ctype_r* restrict a_im = ( ctype_r* )a + 1; \
+    ctype_r* restrict p_re = ( ctype_r* )p; \
+    dim_t i; \
+\
+    if ( bli_is_ro_packed( schema ) ) \
+    { \
+        if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+        { \
+            /* conja is not consulted here: only real parts are read. */ \
+            for ( ; n != 0; --n ) \
+            { \
+                for ( i = 0; i < mnr; ++i ) \
+                { \
+                    PASTEMAC(chr,copys)( a_re[ i*inca_r ], p_re[ i ] ); \
+                } \
+\
+                a_re += lda_r; \
+                p_re += ldp; \
+            } \
+        } \
+        else if ( bli_is_conj( conja ) ) \
+        { \
+            for ( ; n != 0; --n ) \
+            { \
+                for ( i = 0; i < mnr; ++i ) \
+                { \
+                    PASTEMAC(ch,scal2jros)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                } \
+\
+                a_c += lda; \
+                p_re += ldp; \
+            } \
+        } \
+        else \
+        { \
+            for ( ; n != 0; --n ) \
+            { \
+                for ( i = 0; i < mnr; ++i ) \
+                { \
+                    PASTEMAC(ch,scal2ros)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                } \
+\
+                a_c += lda; \
+                p_re += ldp; \
+            } \
+        } \
+    } \
+    else if ( bli_is_io_packed( schema ) ) \
+    { \
+        if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+        { \
+            if ( bli_is_conj( conja ) ) \
+            { \
+                /* Conjugation: store the negated imaginary part. */ \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(chr,copys)( -a_im[ i*inca_r ], p_re[ i ] ); \
+                    } \
+\
+                    a_im += lda_r; \
+                    p_re += ldp; \
+                } \
+            } \
+            else \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(chr,copys)( a_im[ i*inca_r ], p_re[ i ] ); \
+                    } \
+\
+                    a_im += lda_r; \
+                    p_re += ldp; \
+                } \
+            } \
+        } \
+        else \
+        { \
+            if ( bli_is_conj( conja ) ) \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(ch,scal2jios)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                    } \
+\
+                    a_c += lda; \
+                    p_re += ldp; \
+                } \
+            } \
+            else \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(ch,scal2ios)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                    } \
+\
+                    a_c += lda; \
+                    p_re += ldp; \
+                } \
+            } \
+        } \
+    } \
+    else /* if ( bli_is_rpi_packed( schema ) ) */ \
+    { \
+        if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+        { \
+            if ( bli_is_conj( conja ) ) \
+            { \
+                /* Conjugation: combine real part with negated imaginary part. */ \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(chr,add3s)( a_re[ i*inca_r ], -a_im[ i*inca_r ], p_re[ i ] ); \
+                    } \
+\
+                    a_re += lda_r; \
+                    a_im += lda_r; \
+                    p_re += ldp; \
+                } \
+            } \
+            else \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(chr,add3s)( a_re[ i*inca_r ], a_im[ i*inca_r ], p_re[ i ] ); \
+                    } \
+\
+                    a_re += lda_r; \
+                    a_im += lda_r; \
+                    p_re += ldp; \
+                } \
+            } \
+        } \
+        else \
+        { \
+            if ( bli_is_conj( conja ) ) \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(ch,scal2jrpis)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                    } \
+\
+                    a_c += lda; \
+                    p_re += ldp; \
+                } \
+            } \
+            else \
+            { \
+                for ( ; n != 0; --n ) \
+                { \
+                    for ( i = 0; i < mnr; ++i ) \
+                    { \
+                        PASTEMAC(ch,scal2rpis)( *kappa_c, a_c[ i*inca ], p_re[ i ] ); \
+                    } \
+\
+                    a_c += lda; \
+                    p_re += ldp; \
+                } \
+            } \
+        } \
+    } \
+}
+
+INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_rih )
+
+
+
+#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( 
ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* This works regardless of conja since we are only copying + the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jros)( 
*kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + 
PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 
8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 
+ 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), -*(alpha1_i + 
6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 8*inca2), -*(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 9*inca2), -*(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +10*inca2), -*(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +11*inca2), -*(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +12*inca2), -*(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +13*inca2), -*(alpha1_i +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +14*inca2), -*(alpha1_i +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +15*inca2), -*(alpha1_i +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11) ); \ + 
PASTEMAC(chr,add3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), 
*(pi1_r + 1) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_rih ) + diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.h b/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.h new file mode 100644 index 000000000..d537aa8d5 --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ); + +INSERT_GENTPROT_BASIC( packm_ref_2xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_4xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_6xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_8xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_10xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_12xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_14xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_16xk_rih ) + diff --git a/frame/3/gemm/3m/bli_gemm3m_cntl.c b/frame/3/gemm/3m/bli_gemm3m_cntl.c index 081e54e63..0988c87f8 100644 --- a/frame/3/gemm/3m/bli_gemm3m_cntl.c +++ b/frame/3/gemm/3m/bli_gemm3m_cntl.c @@ -70,38 +70,38 @@ void bli_gemm3m_cntl_init() // blocksizes. gemm3m_mc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm3m_nc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm3m_kc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2, BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 ); gemm3m_mr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm3m_nr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm3m_kr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); diff --git a/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c index 6336a32d8..2e5f8c91c 100644 --- 
a/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c +++ b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c @@ -47,25 +47,17 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ) \ { \ - ctype_r ct_r[ PASTEMAC(chr,mr) * \ - PASTEMAC(chr,nr) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - ctype_r ct_i[ PASTEMAC(chr,mr) * \ - PASTEMAC(chr,nr) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = 1; \ - const inc_t cs_ct = PASTEMAC(chr,mr); \ -\ -\ ctype_r ab_r[ PASTEMAC(chr,mr) * \ PASTEMAC(chr,nr) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ ctype_r ab_i[ PASTEMAC(chr,mr) * \ PASTEMAC(chr,nr) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = PASTEMAC(chr,mr); \ -\ + ctype_r ab_rpi[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ab; \ + inc_t cs_ab; \ \ const dim_t m = PASTEMAC(chr,mr); \ const dim_t n = PASTEMAC(chr,nr); \ @@ -75,29 +67,28 @@ void PASTEMAC(ch,varname)( \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + ps_a; \ - ctype_r* restrict a_ri = ( ctype_r* )a + 2*ps_a; \ + ctype_r* restrict a_rpi = ( ctype_r* )a + 2*ps_a; \ \ ctype_r* restrict b_r = ( ctype_r* )b; \ ctype_r* restrict b_i = ( ctype_r* )b + ps_b; \ - ctype_r* restrict b_ri = ( ctype_r* )b + 2*ps_b; \ + ctype_r* restrict b_rpi = ( ctype_r* )b + 2*ps_b; \ \ - ctype_r* restrict c_r = ( ctype_r* )c; \ - ctype_r* restrict c_i = ( ctype_r* )c + 1; \ -\ - const inc_t rs_c2 = 2 * rs_c; \ - const inc_t cs_c2 = 2 * cs_c; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ ctype_r* restrict zero_r = PASTEMAC(chr,0); \ \ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ \ - ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ - ctype_r 
beta_i = PASTEMAC(ch,imag)( *beta ); \ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ \ void* a_next = bli_auxinfo_next_a( data ); \ void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t n_iter; \ + dim_t n_elem; \ +\ + inc_t incc, ldc; \ + inc_t incab, ldab; \ \ dim_t i, j; \ \ @@ -106,64 +97,54 @@ void PASTEMAC(ch,varname)( \ allow an alpha with non-zero imaginary component to be passed in, because it can't be applied properly using the 3m method. If alpha is not real, then something is very wrong. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ \ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ + /* An optimization: Set local strides and loop bounds based on the + strides of c, so that (a) the micro-kernel accesses ab the same + way it would if it were updating c directly, and (b) c is updated + contiguously. */ \ + if ( bli_is_row_stored( rs_c, cs_c ) ) \ { \ - /* We can handle a non-zero imaginary component on beta, but to do - so we have to manually scale c and then use beta == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scal2ris)( beta_r, \ - beta_i, \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2), \ - *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct) ); \ -\ - /* Use beta.r == 1.0. */ \ - beta_r = *one_r; \ + rs_ab = n; n_iter = m; incc = cs_c; \ + cs_ab = 1; n_elem = n; ldc = rs_c; \ } \ - else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + else /* column-stored or general stride */ \ { \ - /* Copy c to ct without scaling. 
*/ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,copyris)( *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2), \ - *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct) ); \ - } \ - else \ - { \ - /* Since beta is zero, ct can remain uninitialized since it - will be overwritten by the micro-kernel. */ \ + rs_ab = 1; n_iter = n; incc = rs_c; \ + cs_ab = m; n_elem = m; ldc = cs_c; \ } \ + incab = 1; \ + ldab = n_elem; \ \ \ - /* c.r = beta.r * c.r + a.r * b.r - a.i * b.i; - c.i = beta.r * c.i + (a.r + a.i)(b.r + b.i) - a.r * b.r - a.i * b.i; */ \ + /* The following gemm micro-kernel calls implement all "phases" of the + 3m method: + + c = beta * c; + c_r += + a_r * b_r - a_i * b_i; + c_i += (a_r + a_i)(b_r + b_i) - a_r * b_r - a_i * b_i; + + NOTE: Scaling by alpha_r is not shown for space reasons. */ \ +\ \ bli_auxinfo_set_next_ab( a_i, b_i, *data ); \ \ - /* ab.r = alpha.r * a.r * b.r; */ \ + /* ab_r = alpha_r * a_r * b_r; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_r, \ b_r, \ zero_r, \ ab_r, rs_ab, cs_ab, \ data ); \ \ - bli_auxinfo_set_next_ab( a_ri, b_ri, *data ); \ + bli_auxinfo_set_next_ab( a_rpi, b_rpi, *data ); \ \ - /* ab.i = alpha.r * a.i * b.i; */ \ + /* ab_i = alpha_r * a_i * b_i; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_i, \ b_i, \ zero_r, \ @@ -172,47 +153,158 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ \ - /* ct.i = alpha.r * a.ri * b.ri; */ \ + /* ab_rpi = alpha_r * a_rpi * b_rpi; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ - a_ri, \ - b_ri, \ - &beta_r, \ - ct_i, rs_ct, cs_ct, \ + alpha_r, \ + a_rpi, \ + b_rpi, \ + zero_r, \ + ab_rpi, rs_ab, cs_ab, \ data ); \ \ \ - /* ct.r = beta.r * ct.r + ab.r; - ct.r = ct.r - ab.i; - ct.i = ct.i - ab.r; - ct.i = ct.i - ab.i; */ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ + /* How we accumulate the intermediate matrix products stored in ab_r, + ab_i, and ab_rpi depends on 
the value of beta. */ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ { \ - ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ - ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ - ctype_r gammat_r = *(ct_r + i*rs_ct + j*cs_ct); \ - ctype_r gammat_i = *(ct_i + i*rs_ct + j*cs_ct); \ -\ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ + /* c = beta * c; + c_r = c_r + ab_r - ab_i; + c_i = c_i + ab_rpi - ab_r - ab_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ { \ - PASTEMAC(chr,copys)( *zero_r, gammat_r ); \ + const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \ + const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \ + const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ + ctype_r gamma11t_r; \ + ctype_r gamma11t_i; \ +\ + PASTEMAC(ch,copyris)( alphabeta11_r, \ + -alphabeta11_r, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,subris)( alphabeta11_i, \ + alphabeta11_i, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,adds)( alphabeta11_rpi, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,xpbyris)( gamma11t_r, \ + gamma11t_i, \ + beta_r, \ + beta_i, \ + *gamma11_r, \ + *gamma11_i ); \ } \ - else \ + } \ + else if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + ab_r - ab_i; + c_i = c_i + ab_rpi - ab_r - ab_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ { \ - PASTEMAC(chr,scals)( beta_r, gammat_r ); \ + const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \ + const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \ + const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ + ctype_r gamma11t_r; \ + ctype_r gamma11t_i; 
\ +\ + PASTEMAC(ch,copyris)( alphabeta11_r, \ + -alphabeta11_r, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,subris)( alphabeta11_i, \ + alphabeta11_i, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,adds)( alphabeta11_rpi, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,addris)( gamma11t_r, \ + gamma11t_i, \ + *gamma11_r, \ + *gamma11_i ); \ } \ + } \ + else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + { \ + /* c_r = beta_r * c_r + ab_r - ab_i; + c_i = beta_r * c_i + ab_rpi - ab_r - ab_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \ + const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \ + const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ + ctype_r gamma11t_r; \ + ctype_r gamma11t_i; \ \ - PASTEMAC(chr,adds)( alphabeta_r, gammat_r ); \ - PASTEMAC(chr,subs)( alphabeta_i, gammat_r ); \ - PASTEMAC(chr,subs)( alphabeta_r, gammat_i ); \ - PASTEMAC(chr,subs)( alphabeta_i, gammat_i ); \ + PASTEMAC(ch,copyris)( alphabeta11_r, \ + -alphabeta11_r, \ + gamma11t_r, \ + gamma11t_i ); \ \ - /* Store the local values (from ct) back to c. 
*/ \ - PASTEMAC(ch,copyris)( gammat_r, \ - gammat_i, \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2) ); \ + PASTEMAC(ch,subris)( alphabeta11_i, \ + alphabeta11_i, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,adds)( alphabeta11_rpi, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,xpbys)( gamma11t_r, beta_r, *gamma11_r ); \ + PASTEMAC(chr,xpbys)( gamma11t_i, beta_r, *gamma11_i ); \ + } \ + } \ + else /* if ( PASTEMAC(chr,eq0)( beta_r ) ) */ \ + { \ + /* c_r = ab_r - ab_i; + c_i = ab_rpi - ab_r - ab_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \ + const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \ + const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ + ctype_r gamma11t_r; \ + ctype_r gamma11t_i; \ +\ + PASTEMAC(ch,copyris)( alphabeta11_r, \ + -alphabeta11_r, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,subris)( alphabeta11_i, \ + alphabeta11_i, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,adds)( alphabeta11_rpi, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,copyris)( gamma11t_r, \ + gamma11t_i, \ + *gamma11_r, \ + *gamma11_i ); \ + } \ } \ } diff --git a/frame/3/gemm/3mh/bli_gemm3mh.c b/frame/3/gemm/3mh/bli_gemm3mh.c new file mode 100644 index 000000000..b7450bd1a --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh.c @@ -0,0 +1,101 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_gemm3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. 
+ if ( bli_obj_is_complex( *c ) ) + bli_gemm3mh_entry( alpha, a, b, beta, c ); + else + bli_gemm_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, k, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( gemm3mh, gemm3mh ) + diff --git a/frame/3/gemm/3mh/bli_gemm3mh.h b/frame/3/gemm/3mh/bli_gemm3mh.h new file mode 100644 index 000000000..0d236a39e --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_gemm3mh_cntl.h" +#include "bli_gemm3mh_entry.h" + +#include "bli_gemm3mh_ukr_ref.h" + + +// +// Prototype object-based interface. All operands, including the scalars alpha and beta, are passed as obj_t pointers. +// +void bli_gemm3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. Each matrix is passed as a typed buffer plus an (rs, cs) stride pair.
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( gemm3mh ) + diff --git a/frame/3/gemm/3mh/bli_gemm3mh_cntl.c b/frame/3/gemm/3mh/bli_gemm3mh_cntl.c new file mode 100644 index 000000000..ec81d5b2d --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh_cntl.c @@ -0,0 +1,402 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +blksz_t* gemm3mh_mc; +blksz_t* gemm3mh_nc; +blksz_t* gemm3mh_kc; +blksz_t* gemm3mh_mr; +blksz_t* gemm3mh_nr; +blksz_t* gemm3mh_kr; + +func_t* gemm3mh_ukrs; + +packm_t* gemm3mh_packa_cntl_ro; +packm_t* gemm3mh_packb_cntl_ro; +packm_t* gemm3mh_packa_cntl_io; +packm_t* gemm3mh_packb_cntl_io; +packm_t* gemm3mh_packa_cntl_rpi; +packm_t* gemm3mh_packb_cntl_rpi; + +gemm_t* gemm3mh_cntl_bp_ke; +gemm_t* gemm3mh_cntl_op_bp_ro; +gemm_t* gemm3mh_cntl_mm_op_ro; +gemm_t* gemm3mh_cntl_vl_mm_ro; +gemm_t* gemm3mh_cntl_op_bp_io; +gemm_t* gemm3mh_cntl_mm_op_io; +gemm_t* gemm3mh_cntl_vl_mm_io; +gemm_t* gemm3mh_cntl_op_bp_rpi; +gemm_t* gemm3mh_cntl_mm_op_rpi; +gemm_t* gemm3mh_cntl_vl_mm_rpi; + +gemm_t* gemm3mh_cntl_ro; +gemm_t* gemm3mh_cntl_io; +gemm_t* gemm3mh_cntl_rpi; + + +void bli_gemm3mh_cntl_init() +{ /* Create the blocksize, func_t, packm, and gemm control tree objects declared above. */ + // Create blocksize objects for each dimension. + // NOTE: the complex blocksizes for 3mh are equal to their + // corresponding real domain counterparts.
+ gemm3mh_mc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, + BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); + gemm3mh_nc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, + BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); + gemm3mh_kc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, + BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); + gemm3mh_mr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, + BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); + gemm3mh_nr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, + BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); + gemm3mh_kr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, + BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); + + + // Attach the register blksz_t objects as sub-blocksizes to the cache + // blksz_t objects. + bli_blksz_obj_attach_to( gemm3mh_mr, gemm3mh_mc ); + bli_blksz_obj_attach_to( gemm3mh_nr, gemm3mh_nc ); + bli_blksz_obj_attach_to( gemm3mh_kr, gemm3mh_kc ); + + + // Create function pointer object for each datatype-specific gemm + // micro-kernel. Only the c and z entries are set; the first two are NULL. + gemm3mh_ukrs + = + bli_func_obj_create( + NULL, FALSE, + NULL, FALSE, + BLIS_CGEMM3MH_UKERNEL, BLIS_CGEMM3MH_UKERNEL_PREFERS_CONTIG_ROWS, + BLIS_ZGEMM3MH_UKERNEL, BLIS_ZGEMM3MH_UKERNEL_PREFERS_CONTIG_ROWS ); + + + // Create control tree objects for packm operations (real only). + gemm3mh_packa_cntl_ro + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_mr, + gemm3mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_RO, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm3mh_packb_cntl_ro + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_kr, + gemm3mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower?
+ BLIS_PACKED_COL_PANELS_RO, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (imag only). + gemm3mh_packa_cntl_io + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_mr, + gemm3mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_IO, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm3mh_packb_cntl_io + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_kr, + gemm3mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_IO, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (real+imag). + gemm3mh_packa_cntl_rpi + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_mr, + gemm3mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_RPI, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm3mh_packb_cntl_rpi + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_kr, + gemm3mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_RPI, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + gemm3mh_cntl_bp_ke + = + bli_gemm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm3mh_ukrs, + NULL, NULL, NULL, + NULL, NULL, NULL ); + + // + // Create control tree for A.real * B.real. + // + + // Create control tree object for outer panel (to block-panel) + // problem.
(real x real) + gemm3mh_cntl_op_bp_ro + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3mh_mc, + gemm3mh_ukrs, + NULL, + gemm3mh_packa_cntl_ro, + gemm3mh_packb_cntl_ro, + NULL, + gemm3mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (real x real) + gemm3mh_cntl_mm_op_ro + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3mh_kc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_op_bp_ro, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (real x real) + gemm3mh_cntl_vl_mm_ro + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_nc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_mm_op_ro, + NULL ); + + // + // Create control tree for A.imag * B.imag. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (imag x imag) + gemm3mh_cntl_op_bp_io + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3mh_mc, + gemm3mh_ukrs, + NULL, + gemm3mh_packa_cntl_io, + gemm3mh_packb_cntl_io, + NULL, + gemm3mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (imag x imag) + gemm3mh_cntl_mm_op_io + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3mh_kc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_op_bp_io, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (imag x imag) + gemm3mh_cntl_vl_mm_io + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_nc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_mm_op_io, + NULL ); + + // + // Create control tree for (A.real + A.imag) * (B.real + B.imag). + // + + // Create control tree object for outer panel (to block-panel) + // problem.
(real+imag x real+imag) + gemm3mh_cntl_op_bp_rpi + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3mh_mc, + gemm3mh_ukrs, + NULL, + gemm3mh_packa_cntl_rpi, + gemm3mh_packb_cntl_rpi, + NULL, + gemm3mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (real+imag x real+imag) + gemm3mh_cntl_mm_op_rpi + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3mh_kc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_op_bp_rpi, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (real+imag x real+imag) + gemm3mh_cntl_vl_mm_rpi + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_nc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_mm_op_rpi, + NULL ); + + // Alias the three "master" gemm control trees (ro, io, rpi) to shorter names. + gemm3mh_cntl_ro = gemm3mh_cntl_vl_mm_ro; + gemm3mh_cntl_io = gemm3mh_cntl_vl_mm_io; + gemm3mh_cntl_rpi = gemm3mh_cntl_vl_mm_rpi; + +} + +void bli_gemm3mh_cntl_finalize() +{ /* Free the objects created by bli_gemm3mh_cntl_init(); the ro/io/rpi aliases share storage with the vl_mm trees and are not freed separately. */ + bli_blksz_obj_free( gemm3mh_mc ); + bli_blksz_obj_free( gemm3mh_nc ); + bli_blksz_obj_free( gemm3mh_kc ); + bli_blksz_obj_free( gemm3mh_mr ); + bli_blksz_obj_free( gemm3mh_nr ); + bli_blksz_obj_free( gemm3mh_kr ); + + bli_func_obj_free( gemm3mh_ukrs ); + + bli_cntl_obj_free( gemm3mh_packa_cntl_ro ); + bli_cntl_obj_free( gemm3mh_packb_cntl_ro ); + bli_cntl_obj_free( gemm3mh_packa_cntl_io ); + bli_cntl_obj_free( gemm3mh_packb_cntl_io ); + bli_cntl_obj_free( gemm3mh_packa_cntl_rpi ); + bli_cntl_obj_free( gemm3mh_packb_cntl_rpi ); + + bli_cntl_obj_free( gemm3mh_cntl_bp_ke ); + bli_cntl_obj_free( gemm3mh_cntl_op_bp_ro ); + bli_cntl_obj_free( gemm3mh_cntl_mm_op_ro ); + bli_cntl_obj_free( gemm3mh_cntl_vl_mm_ro ); + bli_cntl_obj_free( gemm3mh_cntl_op_bp_io ); + bli_cntl_obj_free( gemm3mh_cntl_mm_op_io ); + bli_cntl_obj_free( gemm3mh_cntl_vl_mm_io ); + bli_cntl_obj_free( gemm3mh_cntl_op_bp_rpi ); +
bli_cntl_obj_free( gemm3mh_cntl_mm_op_rpi ); + bli_cntl_obj_free( gemm3mh_cntl_vl_mm_rpi ); + +} + diff --git a/frame/3/gemm/3mh/bli_gemm3mh_cntl.h b/frame/3/gemm/3mh/bli_gemm3mh_cntl.h new file mode 100644 index 000000000..0d3fc6d49 --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh_cntl.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +void bli_gemm3mh_cntl_init( void ); /* creates the gemm3mh blocksize, func_t, packm, and gemm control tree objects */ +void bli_gemm3mh_cntl_finalize( void ); /* frees the objects created by bli_gemm3mh_cntl_init() */ + diff --git a/frame/3/gemm/3mh/bli_gemm3mh_entry.c b/frame/3/gemm/3mh/bli_gemm3mh_entry.c new file mode 100644 index 000000000..3ae00de3d --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh_entry.c @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_gemm3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ /* Run bli_gemm_front() three times on the same operands, once per control tree (ro, io, rpi). */ + bli_gemm_front( alpha, a, b, beta, c, gemm3mh_cntl_ro ); /* only this first pass receives the caller's beta */ + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); /* BLIS_ONE is passed as beta so this pass adds to c */ + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); /* BLIS_ONE is passed as beta so this pass adds to c */ +} + diff --git a/frame/3/gemm/3mh/bli_gemm3mh_entry.h b/frame/3/gemm/3mh/bli_gemm3mh_entry.h new file mode 100644 index 000000000..9c200db67 --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); /* object-based 3mh driver: runs the ro, io, and rpi gemm control trees on these operands */ + diff --git a/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.c b/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.c new file mode 100644 index 000000000..3d74f2234 --- /dev/null +++ b/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.c @@ -0,0 +1,278 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + ctype_r ct[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); /* real-typed scratch tile that receives the real micro-kernel's product */ \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + ctype_r* restrict a_cast = ( ctype_r* )a; \ +\ + ctype_r* restrict b_cast = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + const pack_t schema = bli_auxinfo_schema_a( data ); /* pack schema read from the auxinfo; the ro/io/rpi tests below select how ct is folded into c */ \ +\ + dim_t n_iter; \ + dim_t n_elem; \ +\ + inc_t incc, ldc; \ + inc_t incct, ldct; \ +\ + dim_t i, j; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 3mh method. + If alpha is not real, then something is very wrong.
*/ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* An optimization: Set local strides and loop bounds based on the + strides of c, so that (a) the micro-kernel accesses ct the same + way it would if it were updating c directly, and (b) c is updated + contiguously. */ \ + if ( bli_is_row_stored( rs_c, cs_c ) ) \ + { \ + rs_ct = n; n_iter = m; incc = cs_c; \ + cs_ct = 1; n_elem = n; ldc = rs_c; \ + } \ + else /* column-stored or general stride */ \ + { \ + rs_ct = 1; n_iter = n; incc = rs_c; \ + cs_ct = m; n_elem = m; ldc = cs_c; \ + } \ + incct = 1; \ + ldct = n_elem; \ +\ +\ + /* The following gemm micro-kernel call implements one "phase" of the + 3m method: + + c = beta * c; + c_r += + a_r * b_r - a_i * b_i; + c_i += (a_r + a_i)(b_r + b_i) - a_r * b_r - a_i * b_i; + + NOTE: Scaling by alpha_r is not shown for space reasons. */ \ +\ +\ + /* ct = alpha_r * a * b; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_cast, \ + b_cast, \ + zero_r, \ + ct, rs_ct, cs_ct, \ + data ); \ +\ +\ + /* How we accumulate the intermediate matrix product stored in ct + depends on (a) the schemas of A and B (they are always the same), + and (b) the value of beta.
*/ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ + { \ + /* c = beta * c; + c_r = c_r + ct; + c_i = c_i - ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(ch,xpbyris)( gamma11t, \ + -gamma11t, \ + beta_r, \ + beta_i, \ + *gamma11_r, \ + *gamma11_i ); \ + } \ + } \ + else if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + ct; + c_i = c_i - ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t, *gamma11_r ); \ + PASTEMAC(chr,subs)( gamma11t, *gamma11_i ); \ + } \ + } \ + else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + { \ + /* c_r = beta_r * c_r + ct; + c_i = beta_r * c_i - ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,xpbys)( gamma11t, beta_r, *gamma11_r ); \ + PASTEMAC(chr,xpbys)( -gamma11t, beta_r, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = ct; + c_i = -ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 );
\ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( gamma11t, *gamma11_r ); \ + PASTEMAC(chr,copys)( -gamma11t, *gamma11_i ); \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r - ct; + c_i = c_i - ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,subs)( gamma11t, *gamma11_r ); \ + PASTEMAC(chr,subs)( gamma11t, *gamma11_i ); \ + } \ + } \ + else /* beta_r != 1 (beta_i is not examined here): handled as if beta were zero, so c is overwritten */ \ + { \ + /* c_r = -ct; + c_i = -ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( -gamma11t, *gamma11_r ); \ + PASTEMAC(chr,copys)( -gamma11t, *gamma11_i ); \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + 0; + c_i = c_i + ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t, *gamma11_i ); \ + } \ + } \ + else /* beta_r != 1 (beta_i is not examined here): handled as if beta were zero, so c is overwritten */ \ + { \ + /* c_r = 0; + c_i = ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ +
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,set0s)( *gamma11_r ); \ + PASTEMAC(chr,copys)( gamma11t, *gamma11_i ); \ + } \ + } \ + } \ +\ +\ +/*PASTEMAC(chr,fprintm)( stdout, "gemm3mh_ukr: b1", k, n, b_cast, n, 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemm3mh_ukr: a1", m, k, a_cast, 1, m, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNCCO_BASIC( gemm3mh_ukr_ref, GEMM_UKERNEL ) + diff --git a/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.h b/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.h new file mode 100644 index 000000000..5b34fff0d --- /dev/null +++ b/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.h @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( gemm3mh_ukr_ref ) + diff --git a/frame/3/gemm/4m/bli_gemm4m_cntl.c b/frame/3/gemm/4m/bli_gemm4m_cntl.c index 630a55753..ab0943221 100644 --- a/frame/3/gemm/4m/bli_gemm4m_cntl.c +++ b/frame/3/gemm/4m/bli_gemm4m_cntl.c @@ -67,38 +67,38 @@ void bli_gemm4m_cntl_init() // parts), we reduce KC by a factor of 2 to compensate.
gemm4m_mc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm4m_nc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm4m_kc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2, BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 ); gemm4m_mr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4m_nr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4m_kr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); diff --git a/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c index e4c171c2f..078cc2a77 100644 --- a/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c +++ b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c @@ -53,9 +53,8 @@ void PASTEMAC(ch,varname)( \ ctype_r ct_i[ PASTEMAC(chr,mr) * \ PASTEMAC(chr,nr) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = 1; \ - const inc_t cs_ct = PASTEMAC(chr,mr); \ -\ + inc_t rs_ct; \ + inc_t cs_ct; \ \ const dim_t m = PASTEMAC(chr,mr); \ const dim_t n = PASTEMAC(chr,nr); \ @@ -68,23 +67,26 @@ void PASTEMAC(ch,varname)( \ \ ctype_r* restrict b_r = ( ctype_r* )b; \ ctype_r* restrict b_i = ( ctype_r* )b + ps_b; \ -\ - ctype_r* restrict c_r = ( ctype_r* )c; \ - ctype_r* restrict c_i = ( ctype_r* )c + 1; \ -\ - const inc_t rs_c2 = 2 * rs_c; \ - const inc_t cs_c2 = 2 * cs_c; \ \ ctype_r* restrict one_r = PASTEMAC(chr,1); \ ctype_r* restrict zero_r = PASTEMAC(chr,0); \ \ - ctype_r 
alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ \ - ctype_r m_alpha_r = -PASTEMAC(ch,real)( *alpha ); \ + ctype_r m_alpha_r = -(*alpha_r); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ \ void* a_next = bli_auxinfo_next_a( data ); \ void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t n_iter; \ + dim_t n_elem; \ +\ + inc_t incc, ldc; \ + inc_t incct, ldct; \ \ dim_t i, j; \ \ @@ -93,20 +95,43 @@ void PASTEMAC(ch,varname)( \ allow an alpha with non-zero imaginary component to be passed in, because it can't be applied properly using the 4m method. If alpha is not real, then something is very wrong. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ \ - /* c.r = beta.r * c.r + alpha.r * a.r * b.r - - alpha.r * a.i * b.i; - c.i = beta.r * c.i + alpha.r * a.r * b.i - + alpha.r * a.i * b.r; */ \ + /* An optimization: Set local strides and loop bounds based on the + strides of c, so that (a) the micro-kernel accesses ct the same + way it would if it were updating c directly, and (b) c is updated + contiguously. */ \ + if ( bli_is_row_stored( rs_c, cs_c ) ) \ + { \ + rs_ct = n; n_iter = m; incc = cs_c; \ + cs_ct = 1; n_elem = n; ldc = rs_c; \ + } \ + else /* column-stored or general stride */ \ + { \ + rs_ct = 1; n_iter = n; incc = rs_c; \ + cs_ct = m; n_elem = m; ldc = cs_c; \ + } \ + incct = 1; \ + ldct = n_elem; \ +\ +\ + /* The following gemm micro-kernel calls implement all "phases" of + the 4m method: + + c = beta * c; + c_r += a_r * b_r - a_i * b_i; + c_i += a_r * b_i + a_i * b_r; + + NOTE: Scaling by alpha_r is not shown for space reasons. 
*/ \ +\ \ bli_auxinfo_set_next_ab( a_r, b_i, *data ); \ \ - /* ct.r = alpha.r * a.r * b.r; */ \ + /* ct_r = alpha_r * a_r * b_r; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_r, \ b_r, \ zero_r, \ @@ -115,9 +140,9 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_i, b_r, *data ); \ \ - /* ct.i = alpha.r * a.r * b.i; */ \ + /* ct_i = alpha_r * a_r * b_i; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_r, \ b_i, \ zero_r, \ @@ -126,9 +151,9 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_i, b_i, *data ); \ \ - /* ct.i += alpha.r * a.i * b.r; */ \ + /* ct_i += alpha_r * a_i * b_r; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_i, \ b_r, \ one_r, \ @@ -137,7 +162,7 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ \ - /* ct.r += -alpha.r * a.i * b.i; */ \ + /* ct_r += -alpha_r * a_i * b_i; */ \ PASTEMAC(chr,gemmukr)( k, \ &m_alpha_r, \ a_i, \ @@ -147,38 +172,78 @@ void PASTEMAC(ch,varname)( \ data ); \ \ \ - /* Accumulate the final result in ct back to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ + /* How we accumulate the intermediate matrix product stored in ct_r + and ct_i depends on the value of beta. 
*/ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,addris)( *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct), \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2) ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct), \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2) ); \ - } \ - else \ - { \ - ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ - ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ + /* c = beta * c + ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \ + const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,xpbyris)( *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct), \ - beta_r, \ - beta_i, \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2) ); \ + PASTEMAC(ch,xpbyris)( gamma11t_r, \ + gamma11t_i, \ + beta_r, \ + beta_i, \ + *gamma11_r, \ + *gamma11_i ); \ + } \ + } \ + else if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + ct_r; */ \ + /* c_i = c_i + ct_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \ + const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t_r, *gamma11_r ); \ + PASTEMAC(chr,adds)( gamma11t_i, *gamma11_i 
); \ + } \ + } \ + else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + { \ + /* c_r = beta_r * c_r + ct_r; */ \ + /* c_i = beta_r * c_i + ct_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \ + const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,xpbys)( gamma11t_r, beta_r, *gamma11_r ); \ + PASTEMAC(chr,xpbys)( gamma11t_i, beta_r, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = ct_r; */ \ + /* c_i = ct_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \ + const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( gamma11t_r, *gamma11_r ); \ + PASTEMAC(chr,copys)( gamma11t_i, *gamma11_i ); \ + } \ } \ } diff --git a/frame/3/gemm/4mh/bli_gemm4mh.c b/frame/3/gemm/4mh/bli_gemm4mh.c new file mode 100644 index 000000000..c5d2b3fcb --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh.c @@ -0,0 +1,101 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_gemm4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_gemm4mh_entry( alpha, a, b, beta, c ); + else + bli_gemm_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, k, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( gemm4mh, gemm4mh ) + diff --git a/frame/3/gemm/4mh/bli_gemm4mh.h b/frame/3/gemm/4mh/bli_gemm4mh.h new file mode 100644 index 000000000..54fb0ece0 --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_gemm4mh_cntl.h" +#include "bli_gemm4mh_entry.h" + +#include "bli_gemm4mh_ukr_ref.h" + + +// +// Prototype object-based interface. +// +void bli_gemm4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( gemm4mh ) + diff --git a/frame/3/gemm/4mh/bli_gemm4mh_cntl.c b/frame/3/gemm/4mh/bli_gemm4mh_cntl.c new file mode 100644 index 000000000..a29a0292c --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh_cntl.c @@ -0,0 +1,431 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +blksz_t* gemm4mh_mc; +blksz_t* gemm4mh_nc; +blksz_t* gemm4mh_kc; +blksz_t* gemm4mh_mr; +blksz_t* gemm4mh_nr; +blksz_t* gemm4mh_kr; + +func_t* gemm4mh_ukrs; + +packm_t* gemm4mh_packa_cntl_ro; +packm_t* gemm4mh_packb_cntl_ro; +packm_t* gemm4mh_packa_cntl_io; +packm_t* gemm4mh_packb_cntl_io; + +gemm_t* gemm4mh_cntl_bp_ke; +gemm_t* gemm4mh_cntl_op_bp_rr; +gemm_t* gemm4mh_cntl_mm_op_rr; +gemm_t* gemm4mh_cntl_vl_mm_rr; +gemm_t* gemm4mh_cntl_op_bp_ri; +gemm_t* gemm4mh_cntl_mm_op_ri; +gemm_t* gemm4mh_cntl_vl_mm_ri; +gemm_t* gemm4mh_cntl_op_bp_ir; +gemm_t* gemm4mh_cntl_mm_op_ir; +gemm_t* gemm4mh_cntl_vl_mm_ir; +gemm_t* gemm4mh_cntl_op_bp_ii; +gemm_t* gemm4mh_cntl_mm_op_ii; +gemm_t* gemm4mh_cntl_vl_mm_ii; + +gemm_t* gemm4mh_cntl_rr; +gemm_t* gemm4mh_cntl_ri; +gemm_t* gemm4mh_cntl_ir; +gemm_t* gemm4mh_cntl_ii; + + +void bli_gemm4mh_cntl_init() +{ + // Create blocksize objects for each dimension. + // NOTE: the complex blocksizes for 4mh are equal to their + // corresponding real domain counterparts. + gemm4mh_mc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, + BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); + gemm4mh_nc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, + BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); + gemm4mh_kc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, + BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); + gemm4mh_mr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, + BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); + gemm4mh_nr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, + BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); + gemm4mh_kr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, + BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); + + + // Attach the register blksz_t objects as sub-blocksizes to the cache + // blksz_t objects. 
+ bli_blksz_obj_attach_to( gemm4mh_mr, gemm4mh_mc ); + bli_blksz_obj_attach_to( gemm4mh_nr, gemm4mh_nc ); + bli_blksz_obj_attach_to( gemm4mh_kr, gemm4mh_kc ); + + + // Create function pointer object for each datatype-specific gemm + // micro-kernel. + gemm4mh_ukrs + = + bli_func_obj_create( + NULL, FALSE, + NULL, FALSE, + BLIS_CGEMM4MH_UKERNEL, BLIS_CGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS, + BLIS_ZGEMM4MH_UKERNEL, BLIS_ZGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS ); + + + // Create control tree objects for packm operations (real only). + gemm4mh_packa_cntl_ro + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_mr, + gemm4mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_RO, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm4mh_packb_cntl_ro + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_kr, + gemm4mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_RO, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (imag only). + gemm4mh_packa_cntl_io + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_mr, + gemm4mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_IO, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm4mh_packb_cntl_io + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_kr, + gemm4mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_IO, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. 
+ gemm4mh_cntl_bp_ke + = + bli_gemm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm4mh_ukrs, + NULL, NULL, NULL, + NULL, NULL, NULL ); + + // + // Create control tree for A.real * B.real. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (real x real) + gemm4mh_cntl_op_bp_rr + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4mh_mc, + gemm4mh_ukrs, + NULL, + gemm4mh_packa_cntl_ro, + gemm4mh_packb_cntl_ro, + NULL, + gemm4mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (real x real) + gemm4mh_cntl_mm_op_rr + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4mh_kc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_op_bp_rr, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (real x real) + gemm4mh_cntl_vl_mm_rr + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_nc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_mm_op_rr, + NULL ); + + // + // Create control tree for A.real * B.imag. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (real x imag) + gemm4mh_cntl_op_bp_ri + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4mh_mc, + gemm4mh_ukrs, + NULL, + gemm4mh_packa_cntl_ro, + gemm4mh_packb_cntl_io, + NULL, + gemm4mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (real x imag) + gemm4mh_cntl_mm_op_ri + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4mh_kc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_op_bp_ri, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. 
(real x imag) + gemm4mh_cntl_vl_mm_ri + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_nc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_mm_op_ri, + NULL ); + + // + // Create control tree for A.imag * B.real. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (imag x real) + gemm4mh_cntl_op_bp_ir + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4mh_mc, + gemm4mh_ukrs, + NULL, + gemm4mh_packa_cntl_io, + gemm4mh_packb_cntl_ro, + NULL, + gemm4mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (imag x real) + gemm4mh_cntl_mm_op_ir + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4mh_kc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_op_bp_ir, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (imag x real) + gemm4mh_cntl_vl_mm_ir + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_nc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_mm_op_ir, + NULL ); + + // + // Create control tree for A.imag * B.imag. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (imag x imag) + gemm4mh_cntl_op_bp_ii + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4mh_mc, + gemm4mh_ukrs, + NULL, + gemm4mh_packa_cntl_io, + gemm4mh_packb_cntl_io, + NULL, + gemm4mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (imag x imag) + gemm4mh_cntl_mm_op_ii + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4mh_kc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_op_bp_ii, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. 
(imag x imag) + gemm4mh_cntl_vl_mm_ii + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_nc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_mm_op_ii, + NULL ); + + + // Alias the "master" gemm control tree to a shorter name. + gemm4mh_cntl_rr = gemm4mh_cntl_vl_mm_rr; + gemm4mh_cntl_ri = gemm4mh_cntl_vl_mm_ri; + gemm4mh_cntl_ir = gemm4mh_cntl_vl_mm_ir; + gemm4mh_cntl_ii = gemm4mh_cntl_vl_mm_ii; + +} + +void bli_gemm4mh_cntl_finalize() +{ + bli_blksz_obj_free( gemm4mh_mc ); + bli_blksz_obj_free( gemm4mh_nc ); + bli_blksz_obj_free( gemm4mh_kc ); + bli_blksz_obj_free( gemm4mh_mr ); + bli_blksz_obj_free( gemm4mh_nr ); + bli_blksz_obj_free( gemm4mh_kr ); + + bli_func_obj_free( gemm4mh_ukrs ); + + bli_cntl_obj_free( gemm4mh_packa_cntl_ro ); + bli_cntl_obj_free( gemm4mh_packb_cntl_ro ); + bli_cntl_obj_free( gemm4mh_packa_cntl_io ); + bli_cntl_obj_free( gemm4mh_packb_cntl_io ); + + bli_cntl_obj_free( gemm4mh_cntl_bp_ke ); + bli_cntl_obj_free( gemm4mh_cntl_op_bp_rr ); + bli_cntl_obj_free( gemm4mh_cntl_mm_op_rr ); + bli_cntl_obj_free( gemm4mh_cntl_vl_mm_rr ); + bli_cntl_obj_free( gemm4mh_cntl_op_bp_ri ); + bli_cntl_obj_free( gemm4mh_cntl_mm_op_ri ); + bli_cntl_obj_free( gemm4mh_cntl_vl_mm_ri ); + bli_cntl_obj_free( gemm4mh_cntl_op_bp_ir ); + bli_cntl_obj_free( gemm4mh_cntl_mm_op_ir ); + bli_cntl_obj_free( gemm4mh_cntl_vl_mm_ir ); + bli_cntl_obj_free( gemm4mh_cntl_op_bp_ii ); + bli_cntl_obj_free( gemm4mh_cntl_mm_op_ii ); + bli_cntl_obj_free( gemm4mh_cntl_vl_mm_ii ); + +} + diff --git a/frame/3/gemm/4mh/bli_gemm4mh_cntl.h b/frame/3/gemm/4mh/bli_gemm4mh_cntl.h new file mode 100644 index 000000000..2ced05dd9 --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh_cntl.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm4mh_cntl_init( void ); +void bli_gemm4mh_cntl_finalize( void ); + diff --git a/frame/3/gemm/4mh/bli_gemm4mh_entry.c b/frame/3/gemm/4mh/bli_gemm4mh_entry.c new file mode 100644 index 000000000..e3bf76e13 --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh_entry.c @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ii; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; + +void bli_gemm4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_gemm_front( alpha, a, b, beta, c, gemm4mh_cntl_rr ); + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/gemm/4mh/bli_gemm4mh_entry.h b/frame/3/gemm/4mh/bli_gemm4mh_entry.h new file mode 100644 index 000000000..904b7c7b7 --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.c b/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.c new file mode 100644 index 000000000..ee6b9066b --- /dev/null +++ b/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.c @@ -0,0 +1,271 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + ctype_r ct[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + ctype_r* restrict a_cast = ( ctype_r* )a; \ +\ + ctype_r* restrict b_cast = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + const pack_t schema_a = bli_auxinfo_schema_a( data ); \ + const pack_t schema_b = bli_auxinfo_schema_b( data ); \ +\ + dim_t n_iter; \ + dim_t n_elem; \ +\ + inc_t incc, ldc; \ + inc_t incct, ldct; \ +\ + dim_t i, j; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 4mh method. + If alpha is not real, then something is very wrong. 
*/ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* An optimization: Set local strides and loop bounds based on the + strides of c, so that (a) the micro-kernel accesses ct the same + way it would if it were updating c directly, and (b) c is updated + contiguously. */ \ + if ( bli_is_row_stored( rs_c, cs_c ) ) \ + { \ + rs_ct = n; n_iter = m; incc = cs_c; \ + cs_ct = 1; n_elem = n; ldc = rs_c; \ + } \ + else /* column-stored or general stride */ \ + { \ + rs_ct = 1; n_iter = n; incc = rs_c; \ + cs_ct = m; n_elem = m; ldc = cs_c; \ + } \ + incct = 1; \ + ldct = n_elem; \ +\ +\ + /* The following gemm micro-kernel call implements one "phase" of the + 4m method: + + c = beta * c; + c_r += a_r * b_r - a_i * b_i; + c_i += a_r * b_i + a_i * b_r; + + NOTE: Scaling by alpha_r is not shown for space reasons. */ \ +\ +\ + /* ct = alpha_r * a * b; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_cast, \ + b_cast, \ + zero_r, \ + ct, rs_ct, cs_ct, \ + data ); \ +\ +\ + /* How we accumulate the intermediate matrix product stored in ct + depends on (a) the schemas of A and B, and (b) the value of + beta. 
*/ \ + if ( bli_is_ro_packed( schema_a ) && \ + bli_is_ro_packed( schema_b ) ) \ + { \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ + { \ + /* c = beta * c; + c_r = c_r + ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ +\ + PASTEMAC(ch,scals)( *beta, *gamma11 ); \ + PASTEMAC(chr,adds)( gamma11t, *gamma11_r ); \ + } \ + } \ + else if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + ct; + c_i = c_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t, *gamma11_r ); \ + } \ + } \ + else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + { \ + /* c_r = beta_r * c_r + ct; + c_i = beta_r * c_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,xpbys)( gamma11t, beta_r, *gamma11_r ); \ + PASTEMAC(chr,scals)( beta_r, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = ct; + c_i = 0; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( gamma11t, *gamma11_r ); \ + PASTEMAC(chr,set0s)( *gamma11_i ); \ + } \ + } \ + } \ + else if ( ( 
bli_is_ro_packed( schema_a ) && \ + bli_is_io_packed( schema_b ) ) || \ + ( bli_is_io_packed( schema_a ) && \ + bli_is_ro_packed( schema_b ) ) \ + ) \ + { \ + if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + 0; + c_i = c_i + ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = 0; + c_i = ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,set0s)( *gamma11_r ); \ + PASTEMAC(chr,copys)( gamma11t, *gamma11_i ); \ + } \ + } \ + } \ + else /* if ( bli_is_io_packed( schema_a ) && \ + bli_is_io_packed( schema_b ) ) */ \ + { \ + if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r - ct; + c_i = c_i + 0; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ +\ + PASTEMAC(chr,subs)( gamma11t, *gamma11_r ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = -ct; + c_i = 0; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( -gamma11t, *gamma11_r 
); \ + PASTEMAC(chr,set0s)( *gamma11_i ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm4mh_ukr_ref, GEMM_UKERNEL ) + diff --git a/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.h b/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.h new file mode 100644 index 000000000..71000ef23 --- /dev/null +++ b/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.h @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( gemm4mh_ukr_ref ) + diff --git a/frame/3/gemm/bli_gemm.c b/frame/3/gemm/bli_gemm.c index 6f6c8f7d1..3939f9ea2 100644 --- a/frame/3/gemm/bli_gemm.c +++ b/frame/3/gemm/bli_gemm.c @@ -43,12 +43,16 @@ void bli_gemm( obj_t* alpha, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_gemm4m_entry( alpha, a, b, beta, c ); - else - bli_gemm_entry( alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_gemm3mh_entry( alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_gemm3m_entry( alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_gemm4mh_entry( alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_gemm4m_entry( alpha, a, b, beta, c ); + else bli_gemm_entry( alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces with homogeneous-typed operands. 
// diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index a39e6dbab..d66488ce7 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -33,6 +33,7 @@ */ #include "bli_gemm_cntl.h" +#include "bli_gemm_query.h" #include "bli_gemm_check.h" #include "bli_gemm_entry.h" #include "bli_gemm_front.h" @@ -50,6 +51,8 @@ #include "bli_gemm4m.h" #include "bli_gemm3m.h" +#include "bli_gemm4mh.h" +#include "bli_gemm3mh.h" // // Prototype object-based interface. diff --git a/frame/3/gemm/bli_gemm_query.c b/frame/3/gemm/bli_gemm_query.c new file mode 100644 index 000000000..256751c06 --- /dev/null +++ b/frame/3/gemm/bli_gemm_query.c @@ -0,0 +1,86 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern func_t* gemm3mh_ukrs; +extern func_t* gemm3m_ukrs; +extern func_t* gemm4mh_ukrs; +extern func_t* gemm4m_ukrs; +extern func_t* gemm_ukrs; + +func_t* bli_gemm_query_ukrs( num_t dt ) +{ + if ( bli_3mh_is_enabled_dt( dt ) ) return gemm3mh_ukrs; + else if ( bli_3m_is_enabled_dt( dt ) ) return gemm3m_ukrs; + else if ( bli_4mh_is_enabled_dt( dt ) ) return gemm4mh_ukrs; + else if ( bli_4m_is_enabled_dt( dt ) ) return gemm4m_ukrs; + else return gemm_ukrs; +} + +char* bli_gemm_query_impl_string( num_t dt ) +{ + if ( bli_3mh_is_enabled_dt( dt ) ) return bli_3mh_get_string(); + else if ( bli_3m_is_enabled_dt( dt ) ) return bli_3m_get_string(); + else if ( bli_4mh_is_enabled_dt( dt ) ) return bli_4mh_get_string(); + else if ( bli_4m_is_enabled_dt( dt ) ) return bli_4m_get_string(); + else return bli_native_get_string(); +} + +kimpl_t bli_gemm_ukernel_impl_type( num_t dt ) +{ + func_t* ukrs = bli_gemm_query_ukrs( dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_SGEMM_UKERNEL_REF || + p == BLIS_DGEMM_UKERNEL_REF || + p == BLIS_CGEMM_UKERNEL_REF || + p == BLIS_ZGEMM_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CGEMM3MH_UKERNEL_REF || + p == BLIS_ZGEMM3MH_UKERNEL_REF || + p == BLIS_CGEMM3M_UKERNEL_REF || + p == BLIS_ZGEMM3M_UKERNEL_REF + ) return BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CGEMM4MH_UKERNEL_REF || + p == BLIS_ZGEMM4MH_UKERNEL_REF || + p == 
BLIS_CGEMM4M_UKERNEL_REF || + p == BLIS_ZGEMM4M_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} + diff --git a/frame/3/gemm/bli_gemm_query.h b/frame/3/gemm/bli_gemm_query.h new file mode 100644 index 000000000..5466c5938 --- /dev/null +++ b/frame/3/gemm/bli_gemm_query.h @@ -0,0 +1,38 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +func_t* bli_gemm_query_ukrs( num_t dt ); +char* bli_gemm_query_impl_string( num_t dt ); + +kimpl_t bli_gemm_ukernel_impl_type( num_t dt ); diff --git a/frame/3/hemm/3mh/bli_hemm3mh.c b/frame/3/hemm/3mh/bli_hemm3mh.c new file mode 100644 index 000000000..337ab28d9 --- /dev/null +++ b/frame/3/hemm/3mh/bli_hemm3mh.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_hemm3mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_hemm3mh_entry( side, alpha, a, b, beta, c ); + else + bli_hemm_entry( side, alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_conj( conja, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( hemm3mh, hemm3mh ) + diff --git a/frame/3/hemm/3mh/bli_hemm3mh.h b/frame/3/hemm/3mh/bli_hemm3mh.h new file mode 100644 index 000000000..60168cfb0 --- /dev/null +++ b/frame/3/hemm/3mh/bli_hemm3mh.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for 
developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_hemm3mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_hemm3mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( hemm3mh ) + diff --git a/frame/3/hemm/3mh/bli_hemm3mh_entry.c b/frame/3/hemm/3mh/bli_hemm3mh_entry.c new file mode 100644 index 000000000..2444e6b3f --- /dev/null +++ b/frame/3/hemm/3mh/bli_hemm3mh_entry.c @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_hemm3mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_hemm_front( side, alpha, a, b, beta, c, gemm3mh_cntl_ro ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/hemm/3mh/bli_hemm3mh_entry.h b/frame/3/hemm/3mh/bli_hemm3mh_entry.h new file mode 100644 index 000000000..08cb026a1 --- /dev/null +++ b/frame/3/hemm/3mh/bli_hemm3mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_hemm3mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/hemm/4mh/bli_hemm4mh.c b/frame/3/hemm/4mh/bli_hemm4mh.c new file mode 100644 index 000000000..4aec7b157 --- /dev/null +++ b/frame/3/hemm/4mh/bli_hemm4mh.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_hemm4mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_hemm4mh_entry( side, alpha, a, b, beta, c ); + else + bli_hemm_entry( side, alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_conj( conja, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( hemm4mh, hemm4mh ) + diff --git a/frame/3/hemm/4mh/bli_hemm4mh.h b/frame/3/hemm/4mh/bli_hemm4mh.h new file mode 100644 index 000000000..e055ee347 --- /dev/null +++ b/frame/3/hemm/4mh/bli_hemm4mh.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_hemm4mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_hemm4mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( hemm4mh ) + diff --git a/frame/3/hemm/4mh/bli_hemm4mh_entry.c b/frame/3/hemm/4mh/bli_hemm4mh_entry.c new file mode 100644 index 000000000..f9545ca85 --- /dev/null +++ b/frame/3/hemm/4mh/bli_hemm4mh_entry.c @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; + +void bli_hemm4mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_hemm_front( side, alpha, a, b, beta, c, gemm4mh_cntl_rr ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/hemm/4mh/bli_hemm4mh_entry.h b/frame/3/hemm/4mh/bli_hemm4mh_entry.h new file mode 100644 index 000000000..0f5ee375f --- /dev/null +++ b/frame/3/hemm/4mh/bli_hemm4mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +/* Declaration of bli_hemm4mh_entry(); the definition is in bli_hemm4mh_entry.c. */ +void bli_hemm4mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/hemm/bli_hemm.c b/frame/3/hemm/bli_hemm.c index 212912fad..c1c62a0b8 100644 --- a/frame/3/hemm/bli_hemm.c +++ b/frame/3/hemm/bli_hemm.c @@ -44,12 +44,16 @@ void bli_hemm( side_t side, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_hemm4m_entry( side, alpha, a, b, beta, c ); - else - bli_hemm_entry( side, alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); +/* Methods are tried in the order 3mh, 3m, 4mh, 4m for the datatype of c; the first enabled one selects the entry point, and bli_hemm_entry() is the fallback. */ + if ( bli_3mh_is_enabled_dt( dt ) ) bli_hemm3mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_hemm3m_entry( side, alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_hemm4mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_hemm4m_entry( side, alpha, a, b, beta, c ); + else bli_hemm_entry( side, alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces with
homogeneous-typed operands. // diff --git a/frame/3/hemm/bli_hemm.h b/frame/3/hemm/bli_hemm.h index 59f655684..3dba760a9 100644 --- a/frame/3/hemm/bli_hemm.h +++ b/frame/3/hemm/bli_hemm.h @@ -38,6 +38,8 @@ #include "bli_hemm4m.h" #include "bli_hemm3m.h" +#include "bli_hemm4mh.h" +#include "bli_hemm3mh.h" // @@ -50,6 +52,7 @@ void bli_hemm( side_t side, obj_t* beta, obj_t* c ); + // // Prototype BLAS-like interfaces with homogeneous-typed operands. // diff --git a/frame/3/her2k/3mh/bli_her2k3mh.c b/frame/3/her2k/3mh/bli_her2k3mh.c new file mode 100644 index 000000000..c4a501c8c --- /dev/null +++ b/frame/3/her2k/3mh/bli_her2k3mh.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// Only the domain of c is tested; all five arguments are forwarded unchanged. +void bli_her2k3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_her2k3mh_entry( alpha, a, b, beta, c ); + else + bli_her2k_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands.
+// +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +/* Typed front end: attaches each raw buffer to an obj_t, then calls the object-based PASTEMAC0(opname). beta points to ctype_r; alpha points to ctype. */\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +/* co was created as m x m above and given uploc; it is now marked BLIS_HERMITIAN. */\ + bli_obj_set_struc( BLIS_HERMITIAN, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNCR_BASIC( her2k3mh, her2k3mh ) + diff --git a/frame/3/her2k/3mh/bli_her2k3mh.h b/frame/3/her2k/3mh/bli_her2k3mh.h new file mode 100644 index 000000000..123dc5847 --- /dev/null +++ b/frame/3/her2k/3mh/bli_her2k3mh.h @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_her2k3mh_entry.h" + + +// +// Prototype object-based interface. +// bli_her2k3mh() is defined in bli_her2k3mh.c. +void bli_her2k3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands.
+// +#undef GENTPROTR +#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ +/* Declares the typed interface for opname; beta points to ctype_r, while alpha, a, b and c point to ctype. */\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROTR_BASIC( her2k3mh ) + diff --git a/frame/3/her2k/3mh/bli_her2k3mh_entry.c b/frame/3/her2k/3mh/bli_her2k3mh_entry.c new file mode 100644 index 000000000..e95ad844d --- /dev/null +++ b/frame/3/her2k/3mh/bli_her2k3mh_entry.c @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; +/* Calls bli_her2k_front() three times on the same operands, once per gemm3mh_cntl_* object, in the order ro, io, rpi. */ +void bli_her2k3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_her2k_front( alpha, a, b, beta, c, gemm3mh_cntl_ro );/* only this first call receives the beta supplied by the caller */ + bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io );/* from here on &BLIS_ONE is passed in place of beta -- presumably so each call adds into c; confirm in bli_her2k_front() */ + bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/her2k/3mh/bli_her2k3mh_entry.h b/frame/3/her2k/3mh/bli_her2k3mh_entry.h new file mode 100644 index 000000000..699705637 --- /dev/null +++ b/frame/3/her2k/3mh/bli_her2k3mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +/* Declaration of bli_her2k3mh_entry(); the definition is in bli_her2k3mh_entry.c. */ +void bli_her2k3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/her2k/4mh/bli_her2k4mh.c b/frame/3/her2k/4mh/bli_her2k4mh.c new file mode 100644 index 000000000..18e4f33e3 --- /dev/null +++ b/frame/3/her2k/4mh/bli_her2k4mh.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// Only the domain of c is tested; all five arguments are forwarded unchanged. +void bli_her2k4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_her2k4mh_entry( alpha, a, b, beta, c ); + else + bli_her2k_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands.
+// +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +/* Typed front end: attaches each raw buffer to an obj_t, then calls the object-based PASTEMAC0(opname). beta points to ctype_r; alpha points to ctype. */\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +/* co was created as m x m above and given uploc; it is now marked BLIS_HERMITIAN. */\ + bli_obj_set_struc( BLIS_HERMITIAN, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNCR_BASIC( her2k4mh, her2k4mh ) + diff --git a/frame/3/her2k/4mh/bli_her2k4mh.h b/frame/3/her2k/4mh/bli_her2k4mh.h new file mode 100644 index 000000000..fa86a85bd --- /dev/null +++ b/frame/3/her2k/4mh/bli_her2k4mh.h @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_her2k4mh_entry.h" + + +// +// Prototype object-based interface. +// bli_her2k4mh() is defined in bli_her2k4mh.c. +void bli_her2k4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands.
+// +#undef GENTPROTR +#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ +/* Declares the typed interface for opname; beta points to ctype_r, while alpha, a, b and c point to ctype. */\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROTR_BASIC( her2k4mh ) + diff --git a/frame/3/her2k/4mh/bli_her2k4mh_entry.c b/frame/3/her2k/4mh/bli_her2k4mh_entry.c new file mode 100644 index 000000000..a122c4c6f --- /dev/null +++ b/frame/3/her2k/4mh/bli_her2k4mh_entry.c @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; +/* Calls bli_her2k_front() four times on the same operands, once per gemm4mh_cntl_* object, in the order rr, ii, ri, ir. */ +void bli_her2k4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_her2k_front( alpha, a, b, beta, c, gemm4mh_cntl_rr );/* only this first call receives the beta supplied by the caller */ + bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii );/* from here on &BLIS_ONE is passed in place of beta -- presumably so each call adds into c; confirm in bli_her2k_front() */ + bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/her2k/4mh/bli_her2k4mh_entry.h b/frame/3/her2k/4mh/bli_her2k4mh_entry.h new file mode 100644 index 000000000..706150811 --- /dev/null +++ b/frame/3/her2k/4mh/bli_her2k4mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +/* Declaration of bli_her2k4mh_entry(); the definition is in bli_her2k4mh_entry.c. */ +void bli_her2k4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/her2k/bli_her2k.c b/frame/3/her2k/bli_her2k.c index 33f685495..74e1613df 100644 --- a/frame/3/her2k/bli_her2k.c +++ b/frame/3/her2k/bli_her2k.c @@ -43,12 +43,16 @@ void bli_her2k( obj_t* alpha, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_her2k4m_entry( alpha, a, b, beta, c ); - else - bli_her2k_entry( alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); +/* Methods are tried in the order 3mh, 3m, 4mh, 4m for the datatype of c; the first enabled one selects the entry point, and bli_her2k_entry() is the fallback. */ + if ( bli_3mh_is_enabled_dt( dt ) ) bli_her2k3mh_entry( alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_her2k3m_entry( alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_her2k4mh_entry( alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_her2k4m_entry( alpha, a, b, beta, c ); + else bli_her2k_entry( alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces with homogeneous-typed operands.
// diff --git a/frame/3/her2k/bli_her2k.h b/frame/3/her2k/bli_her2k.h index 2f41d7e48..a1ffb8e51 100644 --- a/frame/3/her2k/bli_her2k.h +++ b/frame/3/her2k/bli_her2k.h @@ -52,6 +52,8 @@ #include "bli_her2k4m.h" #include "bli_her2k3m.h" +#include "bli_her2k4mh.h" +#include "bli_her2k3mh.h" // diff --git a/frame/3/herk/3mh/bli_herk3mh.c b/frame/3/herk/3mh/bli_herk3mh.c new file mode 100644 index 000000000..b3b7ae2db --- /dev/null +++ b/frame/3/herk/3mh/bli_herk3mh.c @@ -0,0 +1,97 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// Only the domain of c is tested; all four arguments are forwarded unchanged. +void bli_herk3mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_herk3mh_entry( alpha, a, beta, c ); + else + bli_herk_entry( alpha, a, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +/* Typed front end: attaches each raw buffer to an obj_t, then calls the object-based PASTEMAC0(opname). alpha and beta both point to ctype_r. */\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype_r* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, betao, co; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ +/* co was created as m x m above and given uploc; it is now marked BLIS_HERMITIAN. */\ + bli_obj_set_struc( BLIS_HERMITIAN, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &betao, \ + &co ); \ +} +
+INSERT_GENTFUNCR_BASIC( herk3mh, herk3mh ) + diff --git a/frame/3/herk/3mh/bli_herk3mh.h b/frame/3/herk/3mh/bli_herk3mh.h new file mode 100644 index 000000000..b748db2c0 --- /dev/null +++ b/frame/3/herk/3mh/bli_herk3mh.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_herk3mh_entry.h" + + +// +// Prototype object-based interface. 
+// +void bli_herk3mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROTR +#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ +/* Declares the typed interface for opname; alpha and beta point to ctype_r, while a and c point to ctype. */\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype_r* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROTR_BASIC( herk3mh ) + diff --git a/frame/3/herk/3mh/bli_herk3mh_entry.c b/frame/3/herk/3mh/bli_herk3mh_entry.c new file mode 100644 index 000000000..5e8be3635 --- /dev/null +++ b/frame/3/herk/3mh/bli_herk3mh_entry.c @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; +/* Calls bli_herk_front() three times on the same operands, once per gemm3mh_cntl_* object, in the order ro, io, rpi. */ +void bli_herk3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + bli_herk_front( alpha, a, beta, c, gemm3mh_cntl_ro );/* only this first call receives the beta supplied by the caller */ + bli_herk_front( alpha, a, &BLIS_ONE, c, gemm3mh_cntl_io );/* from here on &BLIS_ONE is passed in place of beta -- presumably so each call adds into c; confirm in bli_herk_front() */ + bli_herk_front( alpha, a, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/herk/3mh/bli_herk3mh_entry.h b/frame/3/herk/3mh/bli_herk3mh_entry.h new file mode 100644 index 000000000..7ad1e67b4 --- /dev/null +++ b/frame/3/herk/3mh/bli_herk3mh_entry.h @@ -0,0 +1,39 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+// Declares bli_herk3mh_entry(); operands alpha, a, beta and c are all obj_t pointers.
+void bli_herk3mh_entry( obj_t* alpha,
+                        obj_t* a,
+                        obj_t* beta,
+                        obj_t* c );
+
diff --git a/frame/3/herk/4mh/bli_herk4mh.c b/frame/3/herk/4mh/bli_herk4mh.c
new file mode 100644
index 000000000..7288881cd
--- /dev/null
+++ b/frame/3/herk/4mh/bli_herk4mh.c
@@ -0,0 +1,97 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Define object-based interface.
+// bli_herk4mh(): if bli_obj_is_complex( *c ) call bli_herk4mh_entry(), otherwise bli_herk_entry().
+void bli_herk4mh( obj_t* alpha,
+                  obj_t* a,
+                  obj_t* beta,
+                  obj_t* c )
+{
+    // Since 4mh only applies to the complex domain, we use the regular
+    // implementation for real domain cases.
+    if ( bli_obj_is_complex( *c ) )
+        bli_herk4mh_entry( alpha, a, beta, c );
+    else
+        bli_herk_entry( alpha, a, beta, c );
+}
+
+//
+// Define BLAS-like interfaces with homogeneous-typed operands.
+// GENTFUNCR: typed wrapper that attaches the raw buffers to obj_t's and then calls PASTEMAC0(opname).
+#undef GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          uplo_t uploc, \
+                          trans_t transa, \
+                          dim_t m, \
+                          dim_t k, \
+                          ctype_r* alpha, \
+                          ctype* a, inc_t rs_a, inc_t cs_a, \
+                          ctype_r* beta, \
+                          ctype* c, inc_t rs_c, inc_t cs_c \
+                        ) \
+{ \
+    const num_t dt_r = PASTEMAC(chr,type); \
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    obj_t alphao, ao, betao, co; \
+\
+    dim_t m_a, n_a; \
+/* m_a and n_a are set from (transa, m, k) and used below as the dimensions of ao. */ \
+    bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
+/* alpha and beta are attached as 1x1 objects of datatype dt_r. */ \
+    bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \
+    bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \
+/* ao is m_a x n_a and co is m x m; both use datatype dt. */ \
+    bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
+    bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
+\
+    bli_obj_set_uplo( uploc, co ); \
+    bli_obj_set_conjtrans( transa, ao ); \
+/* Only co is given a structure (BLIS_HERMITIAN). */ \
+    bli_obj_set_struc( BLIS_HERMITIAN, co ); \
+\
+    PASTEMAC0(opname)( &alphao, \
+                       &ao, \
+                       &betao, \
+                       &co ); \
+}
+
+INSERT_GENTFUNCR_BASIC( herk4mh, herk4mh )
+
diff --git a/frame/3/herk/4mh/bli_herk4mh.h b/frame/3/herk/4mh/bli_herk4mh.h
new file mode 100644
index 000000000..aeff510dc
--- /dev/null
+++ b/frame/3/herk/4mh/bli_herk4mh.h
@@ -0,0 +1,65 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "bli_herk4mh_entry.h"
+
+
+//
+// Prototype object-based interface.
+// bli_herk4mh() takes alpha, a, beta and c, each as an obj_t pointer.
+void bli_herk4mh( obj_t* alpha,
+                  obj_t* a,
+                  obj_t* beta,
+                  obj_t* c );
+
+
+//
+// Prototype BLAS-like interfaces with homogeneous-typed operands.
+// In this prototype alpha/beta are ctype_r*, a/c are ctype*; the INSERT_ macro below is defined elsewhere.
+#undef GENTPROTR
+#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          uplo_t uploc, \
+                          trans_t transa, \
+                          dim_t m, \
+                          dim_t k, \
+                          ctype_r* alpha, \
+                          ctype* a, inc_t rs_a, inc_t cs_a, \
+                          ctype_r* beta, \
+                          ctype* c, inc_t rs_c, inc_t cs_c \
+                        );
+
+INSERT_GENTPROTR_BASIC( herk4mh )
+
diff --git a/frame/3/herk/4mh/bli_herk4mh_entry.c b/frame/3/herk/4mh/bli_herk4mh_entry.c
new file mode 100644
index 000000000..19009f715
--- /dev/null
+++ b/frame/3/herk/4mh/bli_herk4mh_entry.c
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include "blis.h"
+// Control trees used by the four passes below; declared extern here, defined elsewhere.
+extern gemm_t* gemm4mh_cntl_rr;
+extern gemm_t* gemm4mh_cntl_ri;
+extern gemm_t* gemm4mh_cntl_ir;
+extern gemm_t* gemm4mh_cntl_ii;
+// Calls bli_herk_front() four times on the same operands, in the control-tree order rr, ii, ri, ir.
+void bli_herk4mh_entry( obj_t* alpha,
+                        obj_t* a,
+                        obj_t* beta,
+                        obj_t* c )
+{
+    bli_herk_front( alpha, a, beta, c, gemm4mh_cntl_rr ); // the only pass given the caller's beta
+    bli_herk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ii ); // beta replaced by &BLIS_ONE from here on
+    bli_herk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ri );
+    bli_herk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ir ); // NOTE(review): presumably BLIS_ONE makes the passes accumulate into c -- confirm
+}
+
diff --git a/frame/3/herk/4mh/bli_herk4mh_entry.h b/frame/3/herk/4mh/bli_herk4mh_entry.h
new file mode 100644
index 000000000..a75a501eb
--- /dev/null
+++ b/frame/3/herk/4mh/bli_herk4mh_entry.h
@@ -0,0 +1,39 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+// Declares bli_herk4mh_entry(); operands alpha, a, beta and c are all obj_t pointers.
+void bli_herk4mh_entry( obj_t* alpha,
+                        obj_t* a,
+                        obj_t* beta,
+                        obj_t* c );
+
diff --git a/frame/3/herk/bli_herk.c b/frame/3/herk/bli_herk.c
index 9cd5c93cb..a56ff3971 100644
--- a/frame/3/herk/bli_herk.c
+++ b/frame/3/herk/bli_herk.c
@@ -42,12 +42,16 @@ void bli_herk( obj_t* alpha,
                obj_t* beta,
                obj_t* c )
 {
-    if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) )
-        bli_herk4m_entry( alpha, a, beta, c );
-    else
-        bli_herk_entry( alpha, a, beta, c );
+    num_t dt = bli_obj_datatype( *c );
+
+    if ( bli_3mh_is_enabled_dt( dt ) ) bli_herk3mh_entry( alpha, a, beta, c );
+    else if ( bli_3m_is_enabled_dt( dt ) ) bli_herk3m_entry( alpha, a, beta, c );
+    else if ( bli_4mh_is_enabled_dt( dt ) ) bli_herk4mh_entry( alpha, a, beta, c );
+    else if ( bli_4m_is_enabled_dt( dt ) ) bli_herk4m_entry( alpha, a, beta, c );
+    else bli_herk_entry( alpha, a, beta, c );
 }
 
+
 //
 // Define BLAS-like interfaces with homogeneous-typed operands.
// diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h index 4a81b2b9e..8c779fe91 100644 --- a/frame/3/herk/bli_herk.h +++ b/frame/3/herk/bli_herk.h @@ -48,6 +48,8 @@ #include "bli_herk4m.h" #include "bli_herk3m.h" +#include "bli_herk4mh.h" +#include "bli_herk3mh.h" // diff --git a/frame/3/symm/3mh/bli_symm3mh.c b/frame/3/symm/3mh/bli_symm3mh.c new file mode 100644 index 000000000..c79f367c0 --- /dev/null +++ b/frame/3/symm/3mh/bli_symm3mh.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Define object-based interface.
+// bli_symm3mh(): if bli_obj_is_complex( *c ) call bli_symm3mh_entry(), otherwise bli_symm_entry().
+void bli_symm3mh( side_t side,
+                  obj_t* alpha,
+                  obj_t* a,
+                  obj_t* b,
+                  obj_t* beta,
+                  obj_t* c )
+{
+    // Since 3mh only applies to the complex domain, we use the regular
+    // implementation for real domain cases.
+    if ( bli_obj_is_complex( *c ) )
+        bli_symm3mh_entry( side, alpha, a, b, beta, c );
+    else
+        bli_symm_entry( side, alpha, a, b, beta, c );
+}
+
+//
+// Define BLAS-like interfaces with homogeneous-typed operands.
+// GENTFUNC: typed wrapper that attaches the raw buffers to obj_t's and then calls PASTEMAC0(opname).
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname, varname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          side_t side, \
+                          uplo_t uploa, \
+                          conj_t conja, \
+                          trans_t transb, \
+                          dim_t m, \
+                          dim_t n, \
+                          ctype* alpha, \
+                          ctype* a, inc_t rs_a, inc_t cs_a, \
+                          ctype* b, inc_t rs_b, inc_t cs_b, \
+                          ctype* beta, \
+                          ctype* c, inc_t rs_c, inc_t cs_c \
+                        ) \
+{ \
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    obj_t alphao, ao, bo, betao, co; \
+\
+    dim_t mn_a; \
+    dim_t m_b, n_b; \
+/* mn_a (from side, m, n) is used as both dimensions of ao; m_b and n_b (from transb, m, n) size bo. */ \
+    bli_set_dim_with_side( side, m, n, mn_a ); \
+    bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \
+/* alpha and beta are attached as 1x1 objects of datatype dt. */ \
+    bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
+    bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
+\
+    bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
+    bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
+    bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
+\
+    bli_obj_set_uplo( uploa, ao ); \
+    bli_obj_set_conj( conja, ao ); \
+    bli_obj_set_conjtrans( transb, bo ); \
+/* Only ao is given a structure (BLIS_SYMMETRIC). */ \
+    bli_obj_set_struc( BLIS_SYMMETRIC, ao ); \
+\
+    PASTEMAC0(opname)( side, \
+                       &alphao, \
+                       &ao, \
+                       &bo, \
+                       &betao, \
+                       &co ); \
+}
+
+INSERT_GENTFUNC_BASIC( symm3mh, symm3mh )
+
diff --git a/frame/3/symm/3mh/bli_symm3mh.h b/frame/3/symm/3mh/bli_symm3mh.h
new file mode 100644
index 000000000..d353a8651
--- /dev/null
+++ b/frame/3/symm/3mh/bli_symm3mh.h
@@ -0,0 +1,69 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "bli_symm3mh_entry.h"
+
+
+//
+// Prototype object-based interface.
+// bli_symm3mh() takes a side_t plus alpha, a, b, beta and c as obj_t pointers.
+void bli_symm3mh( side_t side,
+                  obj_t* alpha,
+                  obj_t* a,
+                  obj_t* b,
+                  obj_t* beta,
+                  obj_t* c );
+
+//
+// Prototype BLAS-like interfaces with homogeneous-typed operands.
+// In this prototype alpha, a, b, beta and c are all ctype*; the INSERT_ macro below is defined elsewhere.
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          side_t side, \
+                          uplo_t uploa, \
+                          conj_t conja, \
+                          trans_t transb, \
+                          dim_t m, \
+                          dim_t n, \
+                          ctype* alpha, \
+                          ctype* a, inc_t rs_a, inc_t cs_a, \
+                          ctype* b, inc_t rs_b, inc_t cs_b, \
+                          ctype* beta, \
+                          ctype* c, inc_t rs_c, inc_t cs_c \
+                        );
+
+INSERT_GENTPROT_BASIC( symm3mh )
+
diff --git a/frame/3/symm/3mh/bli_symm3mh_entry.c b/frame/3/symm/3mh/bli_symm3mh_entry.c
new file mode 100644
index 000000000..1277cbf8f
--- /dev/null
+++ b/frame/3/symm/3mh/bli_symm3mh_entry.c
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+// Control trees used by the three passes below; declared extern here, defined elsewhere.
+extern gemm_t* gemm3mh_cntl_ro;
+extern gemm_t* gemm3mh_cntl_io;
+extern gemm_t* gemm3mh_cntl_rpi;
+// Calls bli_symm_front() three times on the same operands, once per control tree (ro, io, rpi).
+void bli_symm3mh_entry( side_t side,
+                        obj_t* alpha,
+                        obj_t* a,
+                        obj_t* b,
+                        obj_t* beta,
+                        obj_t* c )
+{
+    bli_symm_front( side, alpha, a, b, beta, c, gemm3mh_cntl_ro ); // the only pass given the caller's beta
+    bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); // beta replaced by &BLIS_ONE
+    bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); // NOTE(review): presumably BLIS_ONE makes the passes accumulate into c -- confirm
+}
+
diff --git a/frame/3/symm/3mh/bli_symm3mh_entry.h b/frame/3/symm/3mh/bli_symm3mh_entry.h
new file mode 100644
index 000000000..1030c3e9f
--- /dev/null
+++ b/frame/3/symm/3mh/bli_symm3mh_entry.h
@@ -0,0 +1,41 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+// Declares bli_symm3mh_entry(); takes a side_t plus alpha, a, b, beta and c as obj_t pointers.
+void bli_symm3mh_entry( side_t side,
+                        obj_t* alpha,
+                        obj_t* a,
+                        obj_t* b,
+                        obj_t* beta,
+                        obj_t* c );
+
diff --git a/frame/3/symm/4mh/bli_symm4mh.c b/frame/3/symm/4mh/bli_symm4mh.c
new file mode 100644
index 000000000..ac62aeadc
--- /dev/null
+++ b/frame/3/symm/4mh/bli_symm4mh.c
@@ -0,0 +1,107 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Define object-based interface.
+// bli_symm4mh(): if bli_obj_is_complex( *c ) call bli_symm4mh_entry(), otherwise bli_symm_entry().
+void bli_symm4mh( side_t side,
+                  obj_t* alpha,
+                  obj_t* a,
+                  obj_t* b,
+                  obj_t* beta,
+                  obj_t* c )
+{
+    // Since 4mh only applies to the complex domain, we use the regular
+    // implementation for real domain cases.
+    if ( bli_obj_is_complex( *c ) )
+        bli_symm4mh_entry( side, alpha, a, b, beta, c );
+    else
+        bli_symm_entry( side, alpha, a, b, beta, c );
+}
+
+//
+// Define BLAS-like interfaces with homogeneous-typed operands.
+// GENTFUNC: typed wrapper that attaches the raw buffers to obj_t's and then calls PASTEMAC0(opname).
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname, varname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          side_t side, \
+                          uplo_t uploa, \
+                          conj_t conja, \
+                          trans_t transb, \
+                          dim_t m, \
+                          dim_t n, \
+                          ctype* alpha, \
+                          ctype* a, inc_t rs_a, inc_t cs_a, \
+                          ctype* b, inc_t rs_b, inc_t cs_b, \
+                          ctype* beta, \
+                          ctype* c, inc_t rs_c, inc_t cs_c \
+                        ) \
+{ \
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    obj_t alphao, ao, bo, betao, co; \
+\
+    dim_t mn_a; \
+    dim_t m_b, n_b; \
+/* mn_a (from side, m, n) is used as both dimensions of ao; m_b and n_b (from transb, m, n) size bo. */ \
+    bli_set_dim_with_side( side, m, n, mn_a ); \
+    bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \
+/* alpha and beta are attached as 1x1 objects of datatype dt. */ \
+    bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
+    bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
+\
+    bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
+    bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
+    bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
+\
+    bli_obj_set_uplo( uploa, ao ); \
+    bli_obj_set_conj( conja, ao ); \
+    bli_obj_set_conjtrans( transb, bo ); \
+/* Only ao is given a structure (BLIS_SYMMETRIC). */ \
+    bli_obj_set_struc( BLIS_SYMMETRIC, ao ); \
+\
+    PASTEMAC0(opname)( side, \
+                       &alphao, \
+                       &ao, \
+                       &bo, \
+                       &betao, \
+                       &co ); \
+}
+
+INSERT_GENTFUNC_BASIC( symm4mh, symm4mh )
+
diff --git a/frame/3/symm/4mh/bli_symm4mh.h b/frame/3/symm/4mh/bli_symm4mh.h
new file mode 100644
index 000000000..e52ba4079
--- /dev/null
+++ b/frame/3/symm/4mh/bli_symm4mh.h
@@ -0,0 +1,69 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "bli_symm4mh_entry.h"
+
+
+//
+// Prototype object-based interface.
+// bli_symm4mh() takes a side_t plus alpha, a, b, beta and c as obj_t pointers.
+void bli_symm4mh( side_t side,
+                  obj_t* alpha,
+                  obj_t* a,
+                  obj_t* b,
+                  obj_t* beta,
+                  obj_t* c );
+
+//
+// Prototype BLAS-like interfaces with homogeneous-typed operands.
+// In this prototype alpha, a, b, beta and c are all ctype*; the INSERT_ macro below is defined elsewhere.
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          side_t side, \
+                          uplo_t uploa, \
+                          conj_t conja, \
+                          trans_t transb, \
+                          dim_t m, \
+                          dim_t n, \
+                          ctype* alpha, \
+                          ctype* a, inc_t rs_a, inc_t cs_a, \
+                          ctype* b, inc_t rs_b, inc_t cs_b, \
+                          ctype* beta, \
+                          ctype* c, inc_t rs_c, inc_t cs_c \
+                        );
+
+INSERT_GENTPROT_BASIC( symm4mh )
+
diff --git a/frame/3/symm/4mh/bli_symm4mh_entry.c b/frame/3/symm/4mh/bli_symm4mh_entry.c
new file mode 100644
index 000000000..e2322e1d0
--- /dev/null
+++ b/frame/3/symm/4mh/bli_symm4mh_entry.c
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+// Control trees used by the four passes below; declared extern here, defined elsewhere.
+extern gemm_t* gemm4mh_cntl_rr;
+extern gemm_t* gemm4mh_cntl_ri;
+extern gemm_t* gemm4mh_cntl_ir;
+extern gemm_t* gemm4mh_cntl_ii;
+// Calls bli_symm_front() four times on the same operands, in the control-tree order rr, ii, ri, ir.
+void bli_symm4mh_entry( side_t side,
+                        obj_t* alpha,
+                        obj_t* a,
+                        obj_t* b,
+                        obj_t* beta,
+                        obj_t* c )
+{
+    bli_symm_front( side, alpha, a, b, beta, c, gemm4mh_cntl_rr ); // the only pass given the caller's beta
+    bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); // beta replaced by &BLIS_ONE from here on
+    bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri );
+    bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); // NOTE(review): presumably BLIS_ONE makes the passes accumulate into c -- confirm
+}
+
diff --git a/frame/3/symm/4mh/bli_symm4mh_entry.h b/frame/3/symm/4mh/bli_symm4mh_entry.h
new file mode 100644
index 000000000..05a416627
--- /dev/null
+++ b/frame/3/symm/4mh/bli_symm4mh_entry.h
@@ -0,0 +1,41 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+// Declares bli_symm4mh_entry(); takes a side_t plus alpha, a, b, beta and c as obj_t pointers.
+void bli_symm4mh_entry( side_t side,
+                        obj_t* alpha,
+                        obj_t* a,
+                        obj_t* b,
+                        obj_t* beta,
+                        obj_t* c );
+
diff --git a/frame/3/symm/bli_symm.c b/frame/3/symm/bli_symm.c
index 459b69cc3..d01232f65 100644
--- a/frame/3/symm/bli_symm.c
+++ b/frame/3/symm/bli_symm.c
@@ -44,12 +44,16 @@ void bli_symm( side_t side,
                obj_t* beta,
                obj_t* c )
 {
-    if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) )
-        bli_symm4m_entry( side, alpha, a, b, beta, c );
-    else
-        bli_symm_entry( side, alpha, a, b, beta, c );
+    num_t dt = bli_obj_datatype( *c );
+
+    if ( bli_3mh_is_enabled_dt( dt ) ) bli_symm3mh_entry( side, alpha, a, b, beta, c );
+    else if ( bli_3m_is_enabled_dt( dt ) ) bli_symm3m_entry( side, alpha, a, b, beta, c );
+    else if ( bli_4mh_is_enabled_dt( dt ) ) bli_symm4mh_entry( side, alpha, a, b, beta, c );
+    else if ( bli_4m_is_enabled_dt( dt ) ) bli_symm4m_entry( side, alpha, a, b, beta, c );
+    else bli_symm_entry( side, alpha, a, b, beta, c );
 }
 
+
 //
 // Define BLAS-like interfaces with
homogeneous-typed operands. // diff --git a/frame/3/symm/bli_symm.h b/frame/3/symm/bli_symm.h index 85b87b0b0..f9bddfa2f 100644 --- a/frame/3/symm/bli_symm.h +++ b/frame/3/symm/bli_symm.h @@ -38,6 +38,8 @@ #include "bli_symm4m.h" #include "bli_symm3m.h" +#include "bli_symm4mh.h" +#include "bli_symm3mh.h" // @@ -50,6 +52,7 @@ void bli_symm( side_t side, obj_t* beta, obj_t* c ); + // // Prototype BLAS-like interfaces with homogeneous-typed operands. // diff --git a/frame/3/syr2k/3mh/bli_syr2k3mh.c b/frame/3/syr2k/3mh/bli_syr2k3mh.c new file mode 100644 index 000000000..0fa00e953 --- /dev/null +++ b/frame/3/syr2k/3mh/bli_syr2k3mh.c @@ -0,0 +1,104 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_syr2k3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_syr2k3mh_entry( alpha, a, b, beta, c ); + else + bli_syr2k_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syr2k3mh, syr2k3mh ) + diff --git a/frame/3/syr2k/3mh/bli_syr2k3mh.h b/frame/3/syr2k/3mh/bli_syr2k3mh.h new file mode 100644 index 000000000..e14739094 --- /dev/null +++ b/frame/3/syr2k/3mh/bli_syr2k3mh.h @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_syr2k3mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_syr2k3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syr2k3mh ) + diff --git a/frame/3/syr2k/3mh/bli_syr2k3mh_entry.c b/frame/3/syr2k/3mh/bli_syr2k3mh_entry.c new file mode 100644 index 000000000..74d5e24a0 --- /dev/null +++ b/frame/3/syr2k/3mh/bli_syr2k3mh_entry.c @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_syr2k3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_syr2k_front( alpha, a, b, beta, c, gemm3mh_cntl_ro ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/syr2k/3mh/bli_syr2k3mh_entry.h b/frame/3/syr2k/3mh/bli_syr2k3mh_entry.h new file mode 100644 index 000000000..55f828542 --- /dev/null +++ b/frame/3/syr2k/3mh/bli_syr2k3mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_syr2k3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/syr2k/4mh/bli_syr2k4mh.c b/frame/3/syr2k/4mh/bli_syr2k4mh.c new file mode 100644 index 000000000..7fc5410d6 --- /dev/null +++ b/frame/3/syr2k/4mh/bli_syr2k4mh.c @@ -0,0 +1,104 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_syr2k4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_syr2k4mh_entry( alpha, a, b, beta, c ); + else + bli_syr2k_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syr2k4mh, syr2k4mh ) + diff --git a/frame/3/syr2k/4mh/bli_syr2k4mh.h b/frame/3/syr2k/4mh/bli_syr2k4mh.h new file mode 100644 index 000000000..2b0cfa4cb --- /dev/null +++ b/frame/3/syr2k/4mh/bli_syr2k4mh.h @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_syr2k4mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_syr2k4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syr2k4mh ) + diff --git a/frame/3/syr2k/4mh/bli_syr2k4mh_entry.c b/frame/3/syr2k/4mh/bli_syr2k4mh_entry.c new file mode 100644 index 000000000..58218174f --- /dev/null +++ b/frame/3/syr2k/4mh/bli_syr2k4mh_entry.c @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; + +void bli_syr2k4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_syr2k_front( alpha, a, b, beta, c, gemm4mh_cntl_rr ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/syr2k/4mh/bli_syr2k4mh_entry.h b/frame/3/syr2k/4mh/bli_syr2k4mh_entry.h new file mode 100644 index 000000000..6e0e4cc9b --- /dev/null +++ b/frame/3/syr2k/4mh/bli_syr2k4mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_syr2k4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/syr2k/bli_syr2k.c b/frame/3/syr2k/bli_syr2k.c index d56a98a5c..9fbc9d7a7 100644 --- a/frame/3/syr2k/bli_syr2k.c +++ b/frame/3/syr2k/bli_syr2k.c @@ -43,12 +43,16 @@ void bli_syr2k( obj_t* alpha, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_syr2k4m_entry( alpha, a, b, beta, c ); - else - bli_syr2k_entry( alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_syr2k3mh_entry( alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_syr2k3m_entry( alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_syr2k4mh_entry( alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_syr2k4m_entry( alpha, a, b, beta, c ); + else bli_syr2k_entry( alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces with homogeneous-typed operands. 
// diff --git a/frame/3/syr2k/bli_syr2k.h b/frame/3/syr2k/bli_syr2k.h index 313164041..99c16dce7 100644 --- a/frame/3/syr2k/bli_syr2k.h +++ b/frame/3/syr2k/bli_syr2k.h @@ -38,6 +38,8 @@ #include "bli_syr2k4m.h" #include "bli_syr2k3m.h" +#include "bli_syr2k4mh.h" +#include "bli_syr2k3mh.h" // diff --git a/frame/3/syrk/3mh/bli_syrk3mh.c b/frame/3/syrk/3mh/bli_syrk3mh.c new file mode 100644 index 000000000..2822a732a --- /dev/null +++ b/frame/3/syrk/3mh/bli_syrk3mh.c @@ -0,0 +1,96 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_syrk3mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_syrk3mh_entry( alpha, a, beta, c ); + else + bli_syrk_entry( alpha, a, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, betao, co; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syrk3mh, syrk3mh ) + diff --git 
a/frame/3/syrk/3mh/bli_syrk3mh.h b/frame/3/syrk/3mh/bli_syrk3mh.h new file mode 100644 index 000000000..c25d02926 --- /dev/null +++ b/frame/3/syrk/3mh/bli_syrk3mh.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_syrk3mh_entry.h" + + +// +// Prototype object-based interface. 
+// +void bli_syrk3mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syrk3mh ) + diff --git a/frame/3/syrk/3mh/bli_syrk3mh_entry.c b/frame/3/syrk/3mh/bli_syrk3mh_entry.c new file mode 100644 index 000000000..8f1e46143 --- /dev/null +++ b/frame/3/syrk/3mh/bli_syrk3mh_entry.c @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_syrk3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + bli_syrk_front( alpha, a, beta, c, gemm3mh_cntl_ro ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/syrk/3mh/bli_syrk3mh_entry.h b/frame/3/syrk/3mh/bli_syrk3mh_entry.h new file mode 100644 index 000000000..f6b3c5e4d --- /dev/null +++ b/frame/3/syrk/3mh/bli_syrk3mh_entry.h @@ -0,0 +1,39 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_syrk3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/syrk/4mh/bli_syrk4mh.c b/frame/3/syrk/4mh/bli_syrk4mh.c new file mode 100644 index 000000000..8ff0cbc39 --- /dev/null +++ b/frame/3/syrk/4mh/bli_syrk4mh.c @@ -0,0 +1,96 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_syrk4mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_syrk4mh_entry( alpha, a, beta, c ); + else + bli_syrk_entry( alpha, a, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, betao, co; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syrk4mh, syrk4mh ) + diff --git a/frame/3/syrk/4mh/bli_syrk4mh.h b/frame/3/syrk/4mh/bli_syrk4mh.h new file mode 100644 index 000000000..9474d350b --- /dev/null +++ b/frame/3/syrk/4mh/bli_syrk4mh.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_syrk4mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_syrk4mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syrk4mh ) + diff --git a/frame/3/syrk/4mh/bli_syrk4mh_entry.c b/frame/3/syrk/4mh/bli_syrk4mh_entry.c new file mode 100644 index 000000000..d7942604f --- /dev/null +++ b/frame/3/syrk/4mh/bli_syrk4mh_entry.c @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; + +void bli_syrk4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + bli_syrk_front( alpha, a, beta, c, gemm4mh_cntl_rr ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/syrk/4mh/bli_syrk4mh_entry.h b/frame/3/syrk/4mh/bli_syrk4mh_entry.h new file mode 100644 index 000000000..1760db027 --- /dev/null +++ b/frame/3/syrk/4mh/bli_syrk4mh_entry.h @@ -0,0 +1,39 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_syrk4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/syrk/bli_syrk.c b/frame/3/syrk/bli_syrk.c index 2a09b430f..69dde1fcb 100644 --- a/frame/3/syrk/bli_syrk.c +++ b/frame/3/syrk/bli_syrk.c @@ -42,12 +42,16 @@ void bli_syrk( obj_t* alpha, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_syrk4m_entry( alpha, a, beta, c ); - else - bli_syrk_entry( alpha, a, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_syrk3mh_entry( alpha, a, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_syrk3m_entry( alpha, a, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_syrk4mh_entry( alpha, a, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_syrk4m_entry( alpha, a, beta, c ); + else bli_syrk_entry( alpha, a, beta, c ); } + // // Define BLAS-like interfaces with homogeneous-typed operands. 
// diff --git a/frame/3/syrk/bli_syrk.h b/frame/3/syrk/bli_syrk.h index cba072d98..268b21b30 100644 --- a/frame/3/syrk/bli_syrk.h +++ b/frame/3/syrk/bli_syrk.h @@ -38,6 +38,8 @@ #include "bli_syrk4m.h" #include "bli_syrk3m.h" +#include "bli_syrk4mh.h" +#include "bli_syrk3mh.h" // diff --git a/frame/3/trmm/bli_trmm.c b/frame/3/trmm/bli_trmm.c index e178de8e6..7037876bb 100644 --- a/frame/3/trmm/bli_trmm.c +++ b/frame/3/trmm/bli_trmm.c @@ -42,12 +42,14 @@ void bli_trmm( side_t side, obj_t* a, obj_t* b ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *b ) ) ) - bli_trmm4m_entry( side, alpha, a, b ); - else - bli_trmm_entry( side, alpha, a, b ); + num_t dt = bli_obj_datatype( *b ); + + if ( bli_3m_is_enabled_dt( dt ) ) bli_trmm3m_entry( side, alpha, a, b ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_trmm4m_entry( side, alpha, a, b ); + else bli_trmm_entry( side, alpha, a, b ); } + // // Define BLAS-like interfaces with homogeneous-typed operands. // diff --git a/frame/3/trmm/bli_trmm.h b/frame/3/trmm/bli_trmm.h index 886824aa1..320b1d9eb 100644 --- a/frame/3/trmm/bli_trmm.h +++ b/frame/3/trmm/bli_trmm.h @@ -32,6 +32,7 @@ */ +#include "bli_trmm_query.h" #include "bli_trmm_check.h" #include "bli_trmm_entry.h" #include "bli_trmm_front.h" diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 1b6ce80cb..be85ea889 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -120,7 +120,8 @@ void bli_trmm_ll_ker_var2( obj_t* a, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. 
if ( bli_obj_is_4m_packed( *a ) || - bli_obj_is_3m_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } + bli_obj_is_3m_packed( *a ) || + bli_obj_is_rih_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 3ae763214..83bd70b6d 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -120,7 +120,8 @@ void bli_trmm_lu_ker_var2( obj_t* a, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_4m_packed( *a ) || - bli_obj_is_3m_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } + bli_obj_is_3m_packed( *a ) || + bli_obj_is_rih_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the diff --git a/frame/3/trmm/bli_trmm_query.c b/frame/3/trmm/bli_trmm_query.c new file mode 100644 index 000000000..74b163c64 --- /dev/null +++ b/frame/3/trmm/bli_trmm_query.c @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern func_t* gemm3m_ukrs; +extern func_t* gemm4m_ukrs; +extern func_t* gemm_ukrs; + +func_t* bli_trmm_query_ukrs( num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) return gemm3m_ukrs; + else if ( bli_4m_is_enabled_dt( dt ) ) return gemm4m_ukrs; + else return gemm_ukrs; +} + +char* bli_trmm_query_impl_string( num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) return bli_3m_get_string(); + else if ( bli_4m_is_enabled_dt( dt ) ) return bli_4m_get_string(); + else return bli_native_get_string(); +} + + diff --git a/frame/3/trmm/bli_trmm_query.h b/frame/3/trmm/bli_trmm_query.h new file mode 100644 index 000000000..27e468b31 --- /dev/null +++ b/frame/3/trmm/bli_trmm_query.h @@ -0,0 +1,36 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +func_t* bli_trmm_query_ukrs( num_t dt ); +char* bli_trmm_query_impl_string( num_t dt ); diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 8c1760649..23ebb15d9 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -120,7 +120,8 @@ void bli_trmm_rl_ker_var2( obj_t* a, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_4m_packed( *a ) || - bli_obj_is_3m_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } + bli_obj_is_3m_packed( *a ) || + bli_obj_is_rih_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index f7894e584..ee3d4344d 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -120,7 +120,8 @@ void bli_trmm_ru_ker_var2( obj_t* a, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_4m_packed( *a ) || - bli_obj_is_3m_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } + bli_obj_is_3m_packed( *a ) || + bli_obj_is_rih_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the diff --git a/frame/3/trmm3/3mh/bli_trmm33mh.c b/frame/3/trmm3/3mh/bli_trmm33mh.c new file mode 100644 index 000000000..8c764a857 --- /dev/null +++ b/frame/3/trmm3/3mh/bli_trmm33mh.c @@ -0,0 +1,109 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_trmm33mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // control tree for real domain cases. 
+ if ( bli_obj_is_complex( *c ) ) + bli_trmm33mh_entry( side, alpha, a, b, beta, c ); + else + bli_trmm3_entry( side, alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( trmm33mh, trmm33mh ) + diff --git a/frame/3/trmm3/3mh/bli_trmm33mh.h b/frame/3/trmm3/3mh/bli_trmm33mh.h new file mode 100644 index 000000000..ba2523d34 --- /dev/null +++ b/frame/3/trmm3/3mh/bli_trmm33mh.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_trmm33mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_trmm33mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( trmm33mh ) + diff --git a/frame/3/trmm3/3mh/bli_trmm33mh_entry.c b/frame/3/trmm3/3mh/bli_trmm33mh_entry.c new file mode 100644 index 000000000..4dd92d9c2 --- /dev/null +++ b/frame/3/trmm3/3mh/bli_trmm33mh_entry.c @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_trmm33mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_trmm3_front( side, alpha, a, b, beta, c, gemm3mh_cntl_ro ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/trmm3/3mh/bli_trmm33mh_entry.h b/frame/3/trmm3/3mh/bli_trmm33mh_entry.h new file mode 100644 index 000000000..0e8934336 --- /dev/null +++ b/frame/3/trmm3/3mh/bli_trmm33mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_trmm33mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/trmm3/4mh/bli_trmm34mh.c b/frame/3/trmm3/4mh/bli_trmm34mh.c new file mode 100644 index 000000000..bcf256429 --- /dev/null +++ b/frame/3/trmm3/4mh/bli_trmm34mh.c @@ -0,0 +1,109 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_trmm34mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_trmm34mh_entry( side, alpha, a, b, beta, c ); + else + bli_trmm3_entry( side, alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( trmm34mh, trmm34mh ) + diff --git a/frame/3/trmm3/4mh/bli_trmm34mh.h b/frame/3/trmm3/4mh/bli_trmm34mh.h new file mode 100644 index 000000000..0e3e3f43b --- /dev/null +++ b/frame/3/trmm3/4mh/bli_trmm34mh.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_trmm34mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_trmm34mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( trmm34mh ) + diff --git a/frame/3/trmm3/4mh/bli_trmm34mh_entry.c b/frame/3/trmm3/4mh/bli_trmm34mh_entry.c new file mode 100644 index 000000000..63548c2ad --- /dev/null +++ b/frame/3/trmm3/4mh/bli_trmm34mh_entry.c @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; + +void bli_trmm34mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_trmm3_front( side, alpha, a, b, beta, c, gemm4mh_cntl_rr ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/trmm3/4mh/bli_trmm34mh_entry.h b/frame/3/trmm3/4mh/bli_trmm34mh_entry.h new file mode 100644 index 000000000..818e1fdf8 --- /dev/null +++ b/frame/3/trmm3/4mh/bli_trmm34mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_trmm34mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/trmm3/bli_trmm3.c b/frame/3/trmm3/bli_trmm3.c index 37b588662..510908ff5 100644 --- a/frame/3/trmm3/bli_trmm3.c +++ b/frame/3/trmm3/bli_trmm3.c @@ -44,12 +44,16 @@ void bli_trmm3( side_t side, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_trmm34m_entry( side, alpha, a, b, beta, c ); - else - bli_trmm3_entry( side, alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_trmm33mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_trmm33m_entry( side, alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_trmm34mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_trmm34m_entry( side, alpha, a, b, beta, c ); + else bli_trmm3_entry( side, alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces 
with homogeneous-typed operands. // diff --git a/frame/3/trmm3/bli_trmm3.h b/frame/3/trmm3/bli_trmm3.h index e7e039d8a..ff53dece5 100644 --- a/frame/3/trmm3/bli_trmm3.h +++ b/frame/3/trmm3/bli_trmm3.h @@ -38,6 +38,8 @@ #include "bli_trmm34m.h" #include "bli_trmm33m.h" +#include "bli_trmm34mh.h" +#include "bli_trmm33mh.h" // diff --git a/frame/3/trsm/3m/bli_trsm3m_cntl.c b/frame/3/trsm/3m/bli_trsm3m_cntl.c index de00c536b..87acfb5b0 100644 --- a/frame/3/trsm/3m/bli_trsm3m_cntl.c +++ b/frame/3/trsm/3m/bli_trsm3m_cntl.c @@ -48,6 +48,9 @@ extern func_t* gemm3m_ukrs; func_t* gemmtrsm3m_l_ukrs; func_t* gemmtrsm3m_u_ukrs; +func_t* trsm3m_l_ukrs; +func_t* trsm3m_u_ukrs; + packm_t* trsm3m_l_packa_cntl; packm_t* trsm3m_l_packb_cntl; @@ -88,6 +91,23 @@ void bli_trsm3m_cntl_init() BLIS_ZGEMMTRSM3M_U_UKERNEL, FALSE ); + // Create function pointer objects for each datatype-specific + // trsm3m_l and trsm3m_u micro-kernel. + trsm3m_l_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CTRSM3M_L_UKERNEL, FALSE, + BLIS_ZTRSM3M_L_UKERNEL, FALSE ); + + trsm3m_u_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CTRSM3M_U_UKERNEL, FALSE, + BLIS_ZTRSM3M_U_UKERNEL, FALSE ); + + // Create control tree objects for packm operations (left side). 
trsm3m_l_packa_cntl = @@ -260,6 +280,8 @@ void bli_trsm3m_cntl_finalize() { bli_func_obj_free( gemmtrsm3m_l_ukrs ); bli_func_obj_free( gemmtrsm3m_u_ukrs ); + bli_func_obj_free( trsm3m_l_ukrs ); + bli_func_obj_free( trsm3m_u_ukrs ); bli_cntl_obj_free( trsm3m_l_packa_cntl ); bli_cntl_obj_free( trsm3m_l_packb_cntl ); diff --git a/frame/3/trsm/4m/bli_trsm4m_cntl.c b/frame/3/trsm/4m/bli_trsm4m_cntl.c index 519d2ad26..7353d8f53 100644 --- a/frame/3/trsm/4m/bli_trsm4m_cntl.c +++ b/frame/3/trsm/4m/bli_trsm4m_cntl.c @@ -48,6 +48,9 @@ extern func_t* gemm4m_ukrs; func_t* gemmtrsm4m_l_ukrs; func_t* gemmtrsm4m_u_ukrs; +func_t* trsm4m_l_ukrs; +func_t* trsm4m_u_ukrs; + packm_t* trsm4m_l_packa_cntl; packm_t* trsm4m_l_packb_cntl; @@ -88,6 +91,22 @@ void bli_trsm4m_cntl_init() BLIS_ZGEMMTRSM4M_U_UKERNEL, FALSE ); + // Create function pointer objects for each datatype-specific + // trsm4m_l and trsm4m_u micro-kernel. + trsm4m_l_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CTRSM4M_L_UKERNEL, FALSE, + BLIS_ZTRSM4M_L_UKERNEL, FALSE ); + + trsm4m_u_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CTRSM4M_U_UKERNEL, FALSE, + BLIS_ZTRSM4M_U_UKERNEL, FALSE ); + // Create control tree objects for packm operations (left side). 
trsm4m_l_packa_cntl @@ -261,6 +280,8 @@ void bli_trsm4m_cntl_finalize() { bli_func_obj_free( gemmtrsm4m_l_ukrs ); bli_func_obj_free( gemmtrsm4m_u_ukrs ); + bli_func_obj_free( trsm4m_l_ukrs ); + bli_func_obj_free( trsm4m_u_ukrs ); bli_cntl_obj_free( trsm4m_l_packa_cntl ); bli_cntl_obj_free( trsm4m_l_packb_cntl ); diff --git a/frame/3/trsm/bli_trsm.c b/frame/3/trsm/bli_trsm.c index 14c4983b1..4b74acf61 100644 --- a/frame/3/trsm/bli_trsm.c +++ b/frame/3/trsm/bli_trsm.c @@ -42,12 +42,14 @@ void bli_trsm( side_t side, obj_t* a, obj_t* b ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *b ) ) ) - bli_trsm4m_entry( side, alpha, a, b ); - else - bli_trsm_entry( side, alpha, a, b ); + num_t dt = bli_obj_datatype( *b ); + + if ( bli_3m_is_enabled_dt( dt ) ) bli_trsm3m_entry( side, alpha, a, b ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_trsm4m_entry( side, alpha, a, b ); + else bli_trsm_entry( side, alpha, a, b ); } + // // Define BLAS-like interfaces with homogeneous-typed operands. // diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h index 9b434288d..c66c2ee74 100644 --- a/frame/3/trsm/bli_trsm.h +++ b/frame/3/trsm/bli_trsm.h @@ -33,6 +33,7 @@ */ #include "bli_trsm_cntl.h" +#include "bli_trsm_query.h" #include "bli_trsm_check.h" #include "bli_trsm_entry.h" #include "bli_trsm_front.h" diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index b78899bca..915150bbe 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -50,6 +50,9 @@ extern gemm_t* gemm_cntl_bp_ke; func_t* gemmtrsm_l_ukrs; func_t* gemmtrsm_u_ukrs; +func_t* trsm_l_ukrs; +func_t* trsm_u_ukrs; + packm_t* trsm_l_packa_cntl; packm_t* trsm_l_packb_cntl; @@ -90,6 +93,23 @@ void bli_trsm_cntl_init() BLIS_ZGEMMTRSM_U_UKERNEL, FALSE ); + // Create function pointer objects for each datatype-specific + // trsm_l and trsm_u micro-kernel. 
+ trsm_l_ukrs + = + bli_func_obj_create( BLIS_STRSM_L_UKERNEL, FALSE, + BLIS_DTRSM_L_UKERNEL, FALSE, + BLIS_CTRSM_L_UKERNEL, FALSE, + BLIS_ZTRSM_L_UKERNEL, FALSE ); + + trsm_u_ukrs + = + bli_func_obj_create( BLIS_STRSM_U_UKERNEL, FALSE, + BLIS_DTRSM_U_UKERNEL, FALSE, + BLIS_CTRSM_U_UKERNEL, FALSE, + BLIS_ZTRSM_U_UKERNEL, FALSE ); + + // Create control tree objects for packm operations (left side). trsm_l_packa_cntl = @@ -262,6 +282,8 @@ void bli_trsm_cntl_finalize() { bli_func_obj_free( gemmtrsm_l_ukrs ); bli_func_obj_free( gemmtrsm_u_ukrs ); + bli_func_obj_free( trsm_l_ukrs ); + bli_func_obj_free( trsm_u_ukrs ); bli_cntl_obj_free( trsm_l_packa_cntl ); bli_cntl_obj_free( trsm_l_packb_cntl ); diff --git a/frame/3/trsm/bli_trsm_query.c b/frame/3/trsm/bli_trsm_query.c new file mode 100644 index 000000000..0a7ba82ae --- /dev/null +++ b/frame/3/trsm/bli_trsm_query.c @@ -0,0 +1,171 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern func_t* gemmtrsm3m_l_ukrs; +extern func_t* gemmtrsm3m_u_ukrs; +extern func_t* gemmtrsm4m_l_ukrs; +extern func_t* gemmtrsm4m_u_ukrs; +extern func_t* gemmtrsm_l_ukrs; +extern func_t* gemmtrsm_u_ukrs; + +extern func_t* trsm3m_l_ukrs; +extern func_t* trsm3m_u_ukrs; +extern func_t* trsm4m_l_ukrs; +extern func_t* trsm4m_u_ukrs; +extern func_t* trsm_l_ukrs; +extern func_t* trsm_u_ukrs; + +func_t* bli_gemmtrsm_query_ukrs( uplo_t uplo, num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) + return ( bli_is_lower( uplo ) ? gemmtrsm3m_l_ukrs + : gemmtrsm3m_u_ukrs ); + else if ( bli_4m_is_enabled_dt( dt ) ) + return ( bli_is_lower( uplo ) ? gemmtrsm4m_l_ukrs + : gemmtrsm4m_u_ukrs ); + else + return ( bli_is_lower( uplo ) ? gemmtrsm_l_ukrs + : gemmtrsm_u_ukrs ); +} + +func_t* bli_trsm_query_ukrs( uplo_t uplo, num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) + return ( bli_is_lower( uplo ) ? trsm3m_l_ukrs + : trsm3m_u_ukrs ); + else if ( bli_4m_is_enabled_dt( dt ) ) + return ( bli_is_lower( uplo ) ? trsm4m_l_ukrs + : trsm4m_u_ukrs ); + else + return ( bli_is_lower( uplo ) ? 
trsm_l_ukrs + : trsm_u_ukrs ); +} + +char* bli_trsm_query_impl_string( num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) return bli_3m_get_string(); + else if ( bli_4m_is_enabled_dt( dt ) ) return bli_4m_get_string(); + else return bli_native_get_string(); +} + +kimpl_t bli_gemmtrsm_l_ukernel_impl_type( num_t dt ) +{ + func_t* ukrs = bli_gemmtrsm_query_ukrs( BLIS_LOWER, dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_SGEMMTRSM_L_UKERNEL_REF || + p == BLIS_DGEMMTRSM_L_UKERNEL_REF || + p == BLIS_CGEMMTRSM_L_UKERNEL_REF || + p == BLIS_ZGEMMTRSM_L_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CGEMMTRSM3M_L_UKERNEL_REF || + p == BLIS_ZGEMMTRSM3M_L_UKERNEL_REF + ) return BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CGEMMTRSM4M_L_UKERNEL_REF || + p == BLIS_ZGEMMTRSM4M_L_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} + +kimpl_t bli_gemmtrsm_u_ukernel_impl_type( num_t dt ) +{ + func_t* ukrs = bli_gemmtrsm_query_ukrs( BLIS_UPPER, dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_SGEMMTRSM_U_UKERNEL_REF || + p == BLIS_DGEMMTRSM_U_UKERNEL_REF || + p == BLIS_CGEMMTRSM_U_UKERNEL_REF || + p == BLIS_ZGEMMTRSM_U_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CGEMMTRSM3M_U_UKERNEL_REF || + p == BLIS_ZGEMMTRSM3M_U_UKERNEL_REF + ) return BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CGEMMTRSM4M_U_UKERNEL_REF || + p == BLIS_ZGEMMTRSM4M_U_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} + +kimpl_t bli_trsm_l_ukernel_impl_type( num_t dt ) +{ + func_t* ukrs = bli_trsm_query_ukrs( BLIS_LOWER, dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_STRSM_L_UKERNEL_REF || + p == BLIS_DTRSM_L_UKERNEL_REF || + p == BLIS_CTRSM_L_UKERNEL_REF || + p == BLIS_ZTRSM_L_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CTRSM3M_L_UKERNEL_REF || + p == BLIS_ZTRSM3M_L_UKERNEL_REF + ) return 
BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CTRSM4M_L_UKERNEL_REF || + p == BLIS_ZTRSM4M_L_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} + +kimpl_t bli_trsm_u_ukernel_impl_type( num_t dt ) +{ + func_t* ukrs = bli_trsm_query_ukrs( BLIS_UPPER, dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_STRSM_U_UKERNEL_REF || + p == BLIS_DTRSM_U_UKERNEL_REF || + p == BLIS_CTRSM_U_UKERNEL_REF || + p == BLIS_ZTRSM_U_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CTRSM3M_U_UKERNEL_REF || + p == BLIS_ZTRSM3M_U_UKERNEL_REF + ) return BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CTRSM4M_U_UKERNEL_REF || + p == BLIS_ZTRSM4M_U_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} + diff --git a/frame/3/trsm/bli_trsm_query.h b/frame/3/trsm/bli_trsm_query.h new file mode 100644 index 000000000..31f2043b8 --- /dev/null +++ b/frame/3/trsm/bli_trsm_query.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +func_t* bli_gemmtrsm_query_ukrs( uplo_t uplo, num_t dt ); +func_t* bli_trsm_query_ukrs( uplo_t uplo, num_t dt ); +char* bli_trsm_query_impl_string( num_t dt ); + +kimpl_t bli_gemmtrsm_l_ukernel_impl_type( num_t dt ); +kimpl_t bli_gemmtrsm_u_ukernel_impl_type( num_t dt ); +kimpl_t bli_trsm_l_ukernel_impl_type( num_t dt ); +kimpl_t bli_trsm_u_ukernel_impl_type( num_t dt ); diff --git a/frame/base/bli_3m.c b/frame/base/bli_3m.c new file mode 100644 index 000000000..c8f8420e9 --- /dev/null +++ b/frame/base/bli_3m.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +static char* bli_3m_str = "3m"; + +static bool_t bli_will_use_3m_c = FALSE; +static bool_t bli_will_use_3m_z = FALSE; + + +char* bli_3m_get_string( void ) { return bli_3m_str; } + +bool_t bli_3m_is_enabled_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) return bli_3m_is_enabled_c(); + else if ( bli_is_dcomplex( dt ) ) return bli_3m_is_enabled_z(); + else return FALSE; +} +bool_t bli_3m_is_enabled_c( void ) { return bli_will_use_3m_c; } +bool_t bli_3m_is_enabled_z( void ) { return bli_will_use_3m_z; } + + +void bli_3m_enable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_3m_enable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_3m_enable_z(); +} +void bli_3m_enable_c( void ) { bli_will_use_3m_c = TRUE; } +void bli_3m_enable_z( void ) { bli_will_use_3m_z = TRUE; } +void bli_3m_enable( void ) { bli_will_use_3m_c = + bli_will_use_3m_z = TRUE; } + + +void bli_3m_disable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_3m_disable_c(); + else if ( bli_is_dcomplex( 
dt ) ) bli_3m_disable_z(); +} + +void bli_3m_disable_c( void ) { bli_will_use_3m_c = FALSE; } +void bli_3m_disable_z( void ) { bli_will_use_3m_z = FALSE; } +void bli_3m_disable( void ) { bli_will_use_3m_c = + bli_will_use_3m_z = FALSE; } diff --git a/frame/base/bli_3m.h b/frame/base/bli_3m.h new file mode 100644 index 000000000..7a3557777 --- /dev/null +++ b/frame/base/bli_3m.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +char* bli_3m_get_string( void ); + +bool_t bli_3m_is_enabled_dt( num_t dt ); +bool_t bli_3m_is_enabled_c( void ); +bool_t bli_3m_is_enabled_z( void ); + +void bli_3m_enable_dt( num_t dt ); +void bli_3m_enable_c( void ); +void bli_3m_enable_z( void ); +void bli_3m_enable( void ); + +void bli_3m_disable_dt( num_t dt ); +void bli_3m_disable_c( void ); +void bli_3m_disable_z( void ); +void bli_3m_disable( void ); diff --git a/frame/base/bli_3mh.c b/frame/base/bli_3mh.c new file mode 100644 index 000000000..6cabb69dc --- /dev/null +++ b/frame/base/bli_3mh.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +static char* bli_3mh_str = "3mh"; + +static bool_t bli_will_use_3mh_c = FALSE; +static bool_t bli_will_use_3mh_z = FALSE; + + +char* bli_3mh_get_string( void ) { return bli_3mh_str; } + +bool_t bli_3mh_is_enabled_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) return bli_3mh_is_enabled_c(); + else if ( bli_is_dcomplex( dt ) ) return bli_3mh_is_enabled_z(); + else return FALSE; +} +bool_t bli_3mh_is_enabled_c( void ) { return bli_will_use_3mh_c; } +bool_t bli_3mh_is_enabled_z( void ) { return bli_will_use_3mh_z; } + + +void bli_3mh_enable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_3mh_enable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_3mh_enable_z(); +} +void bli_3mh_enable_c( void ) { bli_will_use_3mh_c = TRUE; } +void bli_3mh_enable_z( void ) { bli_will_use_3mh_z = TRUE; } +void bli_3mh_enable( void ) { bli_will_use_3mh_c = + bli_will_use_3mh_z = TRUE; } + + +void bli_3mh_disable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_3mh_disable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_3mh_disable_z(); +} + +void bli_3mh_disable_c( void ) { bli_will_use_3mh_c = FALSE; } +void bli_3mh_disable_z( void ) { bli_will_use_3mh_z = FALSE; } +void bli_3mh_disable( 
void ) { bli_will_use_3mh_c = + bli_will_use_3mh_z = FALSE; } diff --git a/frame/base/bli_3mh.h b/frame/base/bli_3mh.h new file mode 100644 index 000000000..4b9798275 --- /dev/null +++ b/frame/base/bli_3mh.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +char* bli_3mh_get_string( void ); + +bool_t bli_3mh_is_enabled_dt( num_t dt ); +bool_t bli_3mh_is_enabled_c( void ); +bool_t bli_3mh_is_enabled_z( void ); + +void bli_3mh_enable_dt( num_t dt ); +void bli_3mh_enable_c( void ); +void bli_3mh_enable_z( void ); +void bli_3mh_enable( void ); + +void bli_3mh_disable_dt( num_t dt ); +void bli_3mh_disable_c( void ); +void bli_3mh_disable_z( void ); +void bli_3mh_disable( void ); diff --git a/frame/base/bli_4m.c b/frame/base/bli_4m.c index f188b5300..7fe0b698b 100644 --- a/frame/base/bli_4m.c +++ b/frame/base/bli_4m.c @@ -34,23 +34,28 @@ #include "blis.h" +static char* bli_native_str = "native"; +static char* bli_4m_str = "4m"; + // Initialize the 4m enabled/disabled state based on the cpp macros -// BLIS_ENABLE_SCOMPLEX_VIA_4M and BLIS_ENABLE_DCOMPLEX_VIA_4M, which -// are set in bli_kernel_macro_defs.h. -#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M +// which are set in bli_kernel_macro_defs.h. +#ifdef BLIS_ENABLE_VIRTUAL_SCOMPLEX static bool_t bli_will_use_4m_c = TRUE; #else static bool_t bli_will_use_4m_c = FALSE; #endif -#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M +#ifdef BLIS_ENABLE_VIRTUAL_DCOMPLEX static bool_t bli_will_use_4m_z = TRUE; #else static bool_t bli_will_use_4m_z = FALSE; #endif -bool_t bli_4m_is_enabled( num_t dt ) +char* bli_native_get_string( void ) { return bli_native_str; } +char* bli_4m_get_string( void ) { return bli_4m_str; } + +bool_t bli_4m_is_enabled_dt( num_t dt ) { if ( bli_is_scomplex( dt ) ) return bli_4m_is_enabled_c(); else if ( bli_is_dcomplex( dt ) ) return bli_4m_is_enabled_z(); @@ -60,18 +65,18 @@ bool_t bli_4m_is_enabled_c( void ) { return bli_will_use_4m_c; } bool_t bli_4m_is_enabled_z( void ) { return bli_will_use_4m_z; } -void bli_4m_enable( num_t dt ) +void bli_4m_enable_dt( num_t dt ) { if ( bli_is_scomplex( dt ) ) bli_4m_enable_c(); else if ( bli_is_dcomplex( dt ) ) bli_4m_enable_z(); } void bli_4m_enable_c( void ) { bli_will_use_4m_c = TRUE; } void bli_4m_enable_z( void ) { 
bli_will_use_4m_z = TRUE; } -void bli_4m_enable_cz( void ) { bli_will_use_4m_c = +void bli_4m_enable( void ) { bli_will_use_4m_c = bli_will_use_4m_z = TRUE; } -void bli_4m_disable( num_t dt ) +void bli_4m_disable_dt( num_t dt ) { if ( bli_is_scomplex( dt ) ) bli_4m_disable_c(); else if ( bli_is_dcomplex( dt ) ) bli_4m_disable_z(); @@ -79,5 +84,5 @@ void bli_4m_disable( num_t dt ) void bli_4m_disable_c( void ) { bli_will_use_4m_c = FALSE; } void bli_4m_disable_z( void ) { bli_will_use_4m_z = FALSE; } -void bli_4m_disable_cz( void ) { bli_will_use_4m_c = +void bli_4m_disable( void ) { bli_will_use_4m_c = bli_will_use_4m_z = FALSE; } diff --git a/frame/base/bli_4m.h b/frame/base/bli_4m.h index 9791d70b8..db497820e 100644 --- a/frame/base/bli_4m.h +++ b/frame/base/bli_4m.h @@ -32,16 +32,19 @@ */ -bool_t bli_4m_is_enabled( num_t dt ); +char* bli_native_get_string( void ); +char* bli_4m_get_string( void ); + +bool_t bli_4m_is_enabled_dt( num_t dt ); bool_t bli_4m_is_enabled_c( void ); bool_t bli_4m_is_enabled_z( void ); -void bli_4m_enable( num_t dt ); +void bli_4m_enable_dt( num_t dt ); void bli_4m_enable_c( void ); void bli_4m_enable_z( void ); -void bli_4m_enable_cz( void ); +void bli_4m_enable( void ); -void bli_4m_disable( num_t dt ); +void bli_4m_disable_dt( num_t dt ); void bli_4m_disable_c( void ); void bli_4m_disable_z( void ); -void bli_4m_disable_cz( void ); +void bli_4m_disable( void ); diff --git a/frame/base/bli_4mh.c b/frame/base/bli_4mh.c new file mode 100644 index 000000000..110961b84 --- /dev/null +++ b/frame/base/bli_4mh.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +static char* bli_4mh_str = "4mh"; + +static bool_t bli_will_use_4mh_c = FALSE; +static bool_t bli_will_use_4mh_z = FALSE; + + +char* bli_4mh_get_string( void ) { return bli_4mh_str; } + +bool_t bli_4mh_is_enabled_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) return bli_4mh_is_enabled_c(); + else if ( bli_is_dcomplex( dt ) ) return bli_4mh_is_enabled_z(); + else return FALSE; +} +bool_t bli_4mh_is_enabled_c( void ) { return bli_will_use_4mh_c; } +bool_t bli_4mh_is_enabled_z( void ) { return bli_will_use_4mh_z; } + + +void bli_4mh_enable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_4mh_enable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_4mh_enable_z(); +} +void bli_4mh_enable_c( void ) { bli_will_use_4mh_c = TRUE; } +void bli_4mh_enable_z( void ) { bli_will_use_4mh_z = TRUE; } +void bli_4mh_enable( void ) { bli_will_use_4mh_c = + bli_will_use_4mh_z = TRUE; } + + +void bli_4mh_disable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_4mh_disable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_4mh_disable_z(); +} + +void bli_4mh_disable_c( void ) { bli_will_use_4mh_c = FALSE; } +void bli_4mh_disable_z( void ) { bli_will_use_4mh_z = FALSE; } +void bli_4mh_disable( void ) { bli_will_use_4mh_c = + bli_will_use_4mh_z = FALSE; } diff --git a/frame/base/bli_4mh.h b/frame/base/bli_4mh.h new file mode 100644 index 000000000..050086732 --- /dev/null +++ b/frame/base/bli_4mh.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +char* bli_4mh_get_string( void ); + +bool_t bli_4mh_is_enabled_dt( num_t dt ); +bool_t bli_4mh_is_enabled_c( void ); +bool_t bli_4mh_is_enabled_z( void ); + +void bli_4mh_enable_dt( num_t dt ); +void bli_4mh_enable_c( void ); +void bli_4mh_enable_z( void ); +void bli_4mh_enable( void ); + +void bli_4mh_disable_dt( num_t dt ); +void bli_4mh_disable_c( void ); +void bli_4mh_disable_z( void ); +void bli_4mh_disable( void ); diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index d22c5911b..2f8fa70ac 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -421,7 +421,7 @@ gint_t bli_info_get_dotxf_fuse_fac_c( void ) { return BLIS_DOTXF_FUSE_FAC_C; } gint_t bli_info_get_dotxf_fuse_fac_z( void ) { return BLIS_DOTXF_FUSE_FAC_Z; } -// dotxf +// dotxaxpyf gint_t bli_info_get_dotxaxpyf_fuse_fac( num_t dt ) { @@ -437,6 +437,39 @@ gint_t bli_info_get_dotxaxpyf_fuse_fac_c( void ) { return BLIS_DOTXAXPYF_FUSE_FA gint_t bli_info_get_dotxaxpyf_fuse_fac_z( void ) { return BLIS_DOTXAXPYF_FUSE_FAC_Z; } +// -- Level-3 kernel definitions -- + +static char* ukr_type_str[4] = { "refnce", + "virt4m", + "virt3m", + "optmzd" }; + +char* bli_info_get_gemm_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_gemm_ukernel_impl_type( dt ) ]; +} + +char* bli_info_get_gemmtrsm_l_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_gemmtrsm_l_ukernel_impl_type( dt ) ]; +} + +char* bli_info_get_gemmtrsm_u_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_gemmtrsm_u_ukernel_impl_type( dt ) ]; +} + +char* bli_info_get_trsm_l_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_trsm_l_ukernel_impl_type( dt ) ]; +} + +char* bli_info_get_trsm_u_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_trsm_u_ukernel_impl_type( dt ) ]; +} + + // -- bli_mem_pool_macro_defs.h ------------------------------------------------ @@ -444,3 +477,18 @@ gint_t bli_info_get_mk_pool_size( void ) { return BLIS_MK_POOL_SIZE; } gint_t bli_info_get_kn_pool_size( void ) { return BLIS_KN_POOL_SIZE; } gint_t 
bli_info_get_mn_pool_size( void ) { return BLIS_MN_POOL_SIZE; } + + +// -- BLIS implementation query (level-3) -------------------------------------- + +char* bli_info_get_gemm_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_hemm_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_herk_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_her2k_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_symm_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_syrk_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_syr2k_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_trmm_impl_string( num_t dt ) { bli_init(); return bli_trmm_query_impl_string( dt ); } +char* bli_info_get_trmm3_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_trsm_impl_string( num_t dt ) { bli_init(); return bli_trsm_query_impl_string( dt ); } + diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 7c087e6d7..7e0dbb2a5 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -181,9 +181,32 @@ gint_t bli_info_get_dotxaxpyf_fuse_fac_c( void ); gint_t bli_info_get_dotxaxpyf_fuse_fac_z( void ); +// -- Level-3 kernel definitions -- + +char* bli_info_get_gemm_ukr_type( num_t dt ); +char* bli_info_get_gemmtrsm_l_ukr_type( num_t dt ); +char* bli_info_get_gemmtrsm_u_ukr_type( num_t dt ); +char* bli_info_get_trsm_l_ukr_type( num_t dt ); +char* bli_info_get_trsm_u_ukr_type( num_t dt ); + + // -- bli_mem_pool_macro_defs.h ------------------------------------------------ gint_t bli_info_get_mk_pool_size( void ); gint_t bli_info_get_kn_pool_size( void ); gint_t bli_info_get_mn_pool_size( void ); + +// -- 
BLIS implementation query (level-3) -------------------------------------- + +char* bli_info_get_gemm_impl_string( num_t dt ); +char* bli_info_get_hemm_impl_string( num_t dt ); +char* bli_info_get_herk_impl_string( num_t dt ); +char* bli_info_get_her2k_impl_string( num_t dt ); +char* bli_info_get_symm_impl_string( num_t dt ); +char* bli_info_get_syrk_impl_string( num_t dt ); +char* bli_info_get_syr2k_impl_string( num_t dt ); +char* bli_info_get_trmm_impl_string( num_t dt ); +char* bli_info_get_trmm3_impl_string( num_t dt ); +char* bli_info_get_trsm_impl_string( num_t dt ); + diff --git a/frame/cntl/bli_cntl_init.c b/frame/cntl/bli_cntl_init.c index 3f885cd7e..cec9ffd0f 100644 --- a/frame/cntl/bli_cntl_init.c +++ b/frame/cntl/bli_cntl_init.c @@ -66,6 +66,12 @@ void bli_cntl_init( void ) // Level-3 via 3m bli_gemm3m_cntl_init(); bli_trsm3m_cntl_init(); + + // Level-3 via 4mh + bli_gemm4mh_cntl_init(); + + // Level-3 via 3mh + bli_gemm3mh_cntl_init(); } void bli_cntl_finalize( void ) @@ -100,5 +106,11 @@ void bli_cntl_finalize( void ) // Level-3 via 3m bli_gemm3m_cntl_finalize(); bli_trsm3m_cntl_finalize(); + + // Level-3 via 4mh + bli_gemm4mh_cntl_finalize(); + + // Level-3 via 3mh + bli_gemm3mh_cntl_finalize(); } diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 67502abc2..8b784e908 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -93,14 +93,15 @@ #ifndef BLIS_CGEMM_UKERNEL #define BLIS_CGEMM_UKERNEL BLIS_CGEMM_UKERNEL_REF #ifdef BLIS_SGEMM_UKERNEL -#define BLIS_ENABLE_SCOMPLEX_VIA_4M +#define BLIS_ENABLE_VIRTUAL_SCOMPLEX #endif +#else #endif #ifndef BLIS_ZGEMM_UKERNEL #define BLIS_ZGEMM_UKERNEL BLIS_ZGEMM_UKERNEL_REF #ifdef BLIS_DGEMM_UKERNEL -#define BLIS_ENABLE_DCOMPLEX_VIA_4M +#define BLIS_ENABLE_VIRTUAL_DCOMPLEX #endif #endif diff --git a/frame/include/bli_kernel_pre_macro_defs.h b/frame/include/bli_kernel_pre_macro_defs.h index cfaae8d3c..47d2d3010 100644 --- 
a/frame/include/bli_kernel_pre_macro_defs.h +++ b/frame/include/bli_kernel_pre_macro_defs.h @@ -135,6 +135,24 @@ #define BLIS_ZTRSM3M_U_UKERNEL_REF bli_ztrsm3m_u_ukr_ref // +// Level-3 4mh +// + +// gemm4mh micro-kernels + +#define BLIS_CGEMM4MH_UKERNEL_REF bli_cgemm4mh_ukr_ref +#define BLIS_ZGEMM4MH_UKERNEL_REF bli_zgemm4mh_ukr_ref + +// +// +// Level-3 3mh +// + +// gemm3mh micro-kernels + +#define BLIS_CGEMM3MH_UKERNEL_REF bli_cgemm3mh_ukr_ref +#define BLIS_ZGEMM3MH_UKERNEL_REF bli_zgemm3mh_ukr_ref + // Level-1m // @@ -274,6 +292,46 @@ #define BLIS_CPACKM_16XK_3M_KERNEL_REF bli_cpackm_ref_16xk_3m #define BLIS_ZPACKM_16XK_3M_KERNEL_REF bli_zpackm_ref_16xk_3m +// packm_2xk_rih kernels + +#define BLIS_CPACKM_2XK_RIH_KERNEL_REF bli_cpackm_ref_2xk_rih +#define BLIS_ZPACKM_2XK_RIH_KERNEL_REF bli_zpackm_ref_2xk_rih + +// packm_4xk_rih kernels + +#define BLIS_CPACKM_4XK_RIH_KERNEL_REF bli_cpackm_ref_4xk_rih +#define BLIS_ZPACKM_4XK_RIH_KERNEL_REF bli_zpackm_ref_4xk_rih + +// packm_6xk_rih kernels + +#define BLIS_CPACKM_6XK_RIH_KERNEL_REF bli_cpackm_ref_6xk_rih +#define BLIS_ZPACKM_6XK_RIH_KERNEL_REF bli_zpackm_ref_6xk_rih + +// packm_8xk_rih kernels + +#define BLIS_CPACKM_8XK_RIH_KERNEL_REF bli_cpackm_ref_8xk_rih +#define BLIS_ZPACKM_8XK_RIH_KERNEL_REF bli_zpackm_ref_8xk_rih + +// packm_10xk_rih kernels + +#define BLIS_CPACKM_10XK_RIH_KERNEL_REF bli_cpackm_ref_10xk_rih +#define BLIS_ZPACKM_10XK_RIH_KERNEL_REF bli_zpackm_ref_10xk_rih + +// packm_12xk_rih kernels + +#define BLIS_CPACKM_12XK_RIH_KERNEL_REF bli_cpackm_ref_12xk_rih +#define BLIS_ZPACKM_12XK_RIH_KERNEL_REF bli_zpackm_ref_12xk_rih + +// packm_14xk_rih kernels + +#define BLIS_CPACKM_14XK_RIH_KERNEL_REF bli_cpackm_ref_14xk_rih +#define BLIS_ZPACKM_14XK_RIH_KERNEL_REF bli_zpackm_ref_14xk_rih + +// packm_16xk_rih kernels + +#define BLIS_CPACKM_16XK_RIH_KERNEL_REF bli_cpackm_ref_16xk_rih +#define BLIS_ZPACKM_16XK_RIH_KERNEL_REF bli_zpackm_ref_16xk_rih + // unpack_2xk kernels #define BLIS_SUNPACKM_2XK_KERNEL_REF 
bli_sunpackm_ref_2xk diff --git a/frame/include/bli_kernel_rih_macro_defs.h b/frame/include/bli_kernel_rih_macro_defs.h new file mode 100644 index 000000000..fae464810 --- /dev/null +++ b/frame/include/bli_kernel_rih_macro_defs.h @@ -0,0 +1,168 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_RIH_MACRO_DEFS_H +#define BLIS_KERNEL_RIH_MACRO_DEFS_H + + +// -- Define 4mh/3mh row access bools ------------------------------------------ + +// gemm4mh micro-kernels + +#define BLIS_CGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + +// gemm3mh micro-kernels + +#define BLIS_CGEMM3MH_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM3MH_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + + +// -- Define default 4mh/3mh-specific kernel names ----------------------------- + +// +// Level-3 +// + +// gemm4mh micro-kernels + +#ifndef BLIS_CGEMM4MH_UKERNEL +#define BLIS_CGEMM4MH_UKERNEL BLIS_CGEMM4MH_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMM4MH_UKERNEL +#define BLIS_ZGEMM4MH_UKERNEL BLIS_ZGEMM4MH_UKERNEL_REF +#endif + +// gemm3mh micro-kernels + +#ifndef BLIS_CGEMM3MH_UKERNEL +#define BLIS_CGEMM3MH_UKERNEL BLIS_CGEMM3MH_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMM3MH_UKERNEL +#define BLIS_ZGEMM3MH_UKERNEL BLIS_ZGEMM3MH_UKERNEL_REF +#endif + +// +// Level-1m +// + +// packm_2xk_rih kernels + +#ifndef BLIS_CPACKM_2XK_RIH_KERNEL +#define BLIS_CPACKM_2XK_RIH_KERNEL BLIS_CPACKM_2XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_RIH_KERNEL +#define BLIS_ZPACKM_2XK_RIH_KERNEL BLIS_ZPACKM_2XK_RIH_KERNEL_REF +#endif + +// packm_4xk_rih kernels + +#ifndef BLIS_CPACKM_4XK_RIH_KERNEL +#define BLIS_CPACKM_4XK_RIH_KERNEL BLIS_CPACKM_4XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_RIH_KERNEL +#define BLIS_ZPACKM_4XK_RIH_KERNEL BLIS_ZPACKM_4XK_RIH_KERNEL_REF +#endif + +// packm_6xk_rih kernels + +#ifndef BLIS_CPACKM_6XK_RIH_KERNEL +#define BLIS_CPACKM_6XK_RIH_KERNEL BLIS_CPACKM_6XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_RIH_KERNEL +#define BLIS_ZPACKM_6XK_RIH_KERNEL BLIS_ZPACKM_6XK_RIH_KERNEL_REF +#endif + +// packm_8xk_rih kernels + +#ifndef 
BLIS_CPACKM_8XK_RIH_KERNEL +#define BLIS_CPACKM_8XK_RIH_KERNEL BLIS_CPACKM_8XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_RIH_KERNEL +#define BLIS_ZPACKM_8XK_RIH_KERNEL BLIS_ZPACKM_8XK_RIH_KERNEL_REF +#endif + +// packm_10xk_rih kernels + +#ifndef BLIS_CPACKM_10XK_RIH_KERNEL +#define BLIS_CPACKM_10XK_RIH_KERNEL BLIS_CPACKM_10XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_RIH_KERNEL +#define BLIS_ZPACKM_10XK_RIH_KERNEL BLIS_ZPACKM_10XK_RIH_KERNEL_REF +#endif + +// packm_12xk_rih kernels + +#ifndef BLIS_CPACKM_12XK_RIH_KERNEL +#define BLIS_CPACKM_12XK_RIH_KERNEL BLIS_CPACKM_12XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_RIH_KERNEL +#define BLIS_ZPACKM_12XK_RIH_KERNEL BLIS_ZPACKM_12XK_RIH_KERNEL_REF +#endif + +// packm_14xk_rih kernels + +#ifndef BLIS_CPACKM_14XK_RIH_KERNEL +#define BLIS_CPACKM_14XK_RIH_KERNEL BLIS_CPACKM_14XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_RIH_KERNEL +#define BLIS_ZPACKM_14XK_RIH_KERNEL BLIS_ZPACKM_14XK_RIH_KERNEL_REF +#endif + +// packm_16xk_rih kernels + +#ifndef BLIS_CPACKM_16XK_RIH_KERNEL +#define BLIS_CPACKM_16XK_RIH_KERNEL BLIS_CPACKM_16XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_RIH_KERNEL +#define BLIS_ZPACKM_16XK_RIH_KERNEL BLIS_ZPACKM_16XK_RIH_KERNEL_REF +#endif + + + +#endif diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 1199f219c..5c4195932 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -221,6 +221,24 @@ \ ( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3M ) +#define bli_obj_is_ro_packed( obj ) \ +\ + ( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RO ) + +#define bli_obj_is_io_packed( obj ) \ +\ + ( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_IO ) + +#define bli_obj_is_rpi_packed( obj ) \ +\ + ( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RPI ) + +#define bli_obj_is_rih_packed( obj ) \ +\ + ( bli_obj_is_ro_packed( obj ) || \ + bli_obj_is_io_packed( obj ) || \ + 
bli_obj_is_rpi_packed( obj ) ) + #define bli_obj_pack_buffer_type( obj ) \ \ ( (obj).info & BLIS_PACK_BUFFER_BITS ) diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index e1fd135a5..0a1e1cce8 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -356,7 +356,6 @@ ( bli_does_notrans( trans ) ? ( m == 1 ? (cs) : (rs) ) \ : ( m == 1 ? (rs) : (cs) ) ) -/* #define bli_is_row_stored( rs, cs ) \ \ ( bli_abs( cs ) == 1 ) @@ -364,7 +363,6 @@ #define bli_is_col_stored( rs, cs ) \ \ ( bli_abs( rs ) == 1 ) -*/ #define bli_is_row_stored_f( m, n, rs, cs ) \ \ @@ -530,6 +528,25 @@ \ ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3M ) +#define bli_is_ro_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RO ) + +#define bli_is_io_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_IO ) + +#define bli_is_rpi_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RPI ) + +#define bli_is_rih_packed( schema ) \ +\ + ( bli_is_ro_packed( schema ) || \ + bli_is_io_packed( schema ) || \ + bli_is_rpi_packed( schema ) ) + + // return datatype for char diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index d5134da39..832dd9f48 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -197,6 +197,25 @@ #include "bli_scal2jri3s.h" +// -- 4mh/3mh-specific scalar macros -- + +#include "bli_scal2rihs_mxn_diag.h" +#include "bli_scal2rihs_mxn_uplo.h" +#include "bli_setrihs_mxn_diag.h" + +// ro +#include "bli_scal2ros.h" +#include "bli_scal2jros.h" + +// io +#include "bli_scal2ios.h" +#include "bli_scal2jios.h" + +// rpi +#include "bli_scal2rpis.h" +#include "bli_scal2jrpis.h" + + // -- Miscellaneous macros -- diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 3ee95b16a..c45c3f120 100644 --- a/frame/include/bli_type_defs.h +++ 
b/frame/include/bli_type_defs.h @@ -209,6 +209,12 @@ typedef dcomplex f77_dcomplex; - 100111: packed by 4m column panels - 101010: packed by 3m row panels - 101011: packed by 3m column panels + - 110010: packed real-only row panels + - 110011: packed real-only column panels + - 110110: packed imag-only row panels + - 110111: packed imag-only column panels + - 111010: packed real+imag row panels + - 111011: packed real+imag column panels 22 Packed panel order if upper-stored - 0 == forward order if upper - 1 == reverse order if upper @@ -309,6 +315,9 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_4M ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_3M ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC BLIS_PACK_BIT #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) @@ -318,6 +327,12 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_PACKED_COL_PANELS_4M ( BLIS_PACK_BIT | BLIS_BITVAL_4M | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_3M ( BLIS_PACK_BIT | BLIS_BITVAL_3M | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_3M ( BLIS_PACK_BIT | BLIS_BITVAL_3M | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | 
BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 @@ -427,6 +442,12 @@ typedef enum BLIS_PACKED_COL_PANELS_4M = BLIS_BITVAL_PACKED_COL_PANELS_4M, BLIS_PACKED_ROW_PANELS_3M = BLIS_BITVAL_PACKED_ROW_PANELS_3M, BLIS_PACKED_COL_PANELS_3M = BLIS_BITVAL_PACKED_COL_PANELS_3M, + BLIS_PACKED_ROW_PANELS_RO = BLIS_BITVAL_PACKED_ROW_PANELS_RO, + BLIS_PACKED_COL_PANELS_RO = BLIS_BITVAL_PACKED_COL_PANELS_RO, + BLIS_PACKED_ROW_PANELS_IO = BLIS_BITVAL_PACKED_ROW_PANELS_IO, + BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO, + BLIS_PACKED_ROW_PANELS_RPI = BLIS_BITVAL_PACKED_ROW_PANELS_RPI, + BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI, } pack_t; @@ -453,6 +474,17 @@ typedef enum } packbuf_t; +// -- micro-kernel implementation type -- + +typedef enum +{ + BLIS_REFERENCE_UKERNEL = 0, + BLIS_VIRTUAL4M_UKERNEL, + BLIS_VIRTUAL3M_UKERNEL, + BLIS_OPTIMIZED_UKERNEL, +} kimpl_t; + + // // -- BLIS misc. structure types ----------------------------------------------- // diff --git a/frame/include/blis.h b/frame/include/blis.h index 446ad27ad..591986893 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -84,6 +84,7 @@ extern "C" { #include "bli_kernel_macro_defs.h" #include "bli_kernel_4m_macro_defs.h" #include "bli_kernel_3m_macro_defs.h" +#include "bli_kernel_rih_macro_defs.h" #include "bli_kernel_post_macro_defs.h" #include "bli_kernel_prototypes.h" @@ -113,6 +114,9 @@ extern "C" { #include "bli_info.h" #include "bli_getopt.h" #include "bli_4m.h" +#include "bli_3m.h" +#include "bli_4mh.h" +#include "bli_3mh.h" // Control tree definitions. 
#include "bli_cntl.h" diff --git a/frame/include/level0/io/bli_scal2ios.h b/frame/include/level0/io/bli_scal2ios.h new file mode 100644 index 000000000..268f8ebf0 --- /dev/null +++ b/frame/include/level0/io/bli_scal2ios.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#ifndef BLIS_SCAL2IOS_H +#define BLIS_SCAL2IOS_H + +// scal2ios + +#define bli_cscal2ios( a, x, yi ) \ +{ \ + (yi) = bli_cimag(a) * bli_creal(x) + bli_creal(a) * bli_cimag(x); \ +} + +#define bli_zscal2ios( a, x, yi ) \ +{ \ + (yi) = bli_zimag(a) * bli_zreal(x) + bli_zreal(a) * bli_zimag(x); \ +} + +#define bli_scscal2ios( a, x, yi ) \ +{ \ + (yi) = bli_creal(a) * bli_cimag(x); \ +} + +#define bli_dzscal2ios( a, x, yi ) \ +{ \ + (yi) = bli_zreal(a) * bli_zimag(x); \ +} + +#endif + diff --git a/frame/include/level0/io/bli_scal2jios.h b/frame/include/level0/io/bli_scal2jios.h new file mode 100644 index 000000000..55038b5d3 --- /dev/null +++ b/frame/include/level0/io/bli_scal2jios.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2JIOS_H +#define BLIS_SCAL2JIOS_H + +// scal2jios + +#define bli_cscal2jios( a, x, yi ) \ +{ \ + (yi) = bli_cimag(a) * bli_creal(x) - bli_creal(a) * bli_cimag(x); \ +} + +#define bli_zscal2jios( a, x, yi ) \ +{ \ + (yi) = bli_zimag(a) * bli_zreal(x) - bli_zreal(a) * bli_zimag(x); \ +} + + +#endif + diff --git a/frame/include/level0/rih/bli_scal2rihs_mxn_diag.h b/frame/include/level0/rih/bli_scal2rihs_mxn_diag.h new file mode 100644 index 000000000..39f270820 --- /dev/null +++ b/frame/include/level0/rih/bli_scal2rihs_mxn_diag.h @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2RIHS_MXN_DIAG_H +#define BLIS_SCAL2RIHS_MXN_DIAG_H + +// scal2rihs_mxn_diag + +#define bli_cscscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle ro, io, and rpi separately. */ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal2ros( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal2ios( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal2rpis( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ +} + +#define bli_zdzscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle ro, io, and rpi separately. 
*/ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal2ros( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal2ios( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal2rpis( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/rih/bli_scal2rihs_mxn_uplo.h b/frame/include/level0/rih/bli_scal2rihs_mxn_uplo.h new file mode 100644 index 000000000..38423dfcb --- /dev/null +++ b/frame/include/level0/rih/bli_scal2rihs_mxn_uplo.h @@ -0,0 +1,348 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2RIHS_MXN_UPLO_H +#define BLIS_SCAL2RIHS_MXN_UPLO_H + +// scal2rihs_mxn_uplo + +#define bli_cscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle ro, io, and rpi separately. */ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2jros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2ros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2jros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2ros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2jios( 
*(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2ios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2jios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2ios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2jrpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2rpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2jrpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2rpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ +} + +#define bli_zscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle ro, io, and rpi separately. 
*/ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2jros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2ros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2jros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2ros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2jios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2ios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2jios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2ios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* 
if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2jrpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2rpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2jrpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2rpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/rih/bli_setrihs_mxn_diag.h b/frame/include/level0/rih/bli_setrihs_mxn_diag.h new file mode 100644 index 000000000..3fe2a8215 --- /dev/null +++ b/frame/include/level0/rih/bli_setrihs_mxn_diag.h @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SETRIHS_MXN_DIAG_H +#define BLIS_SETRIHS_MXN_DIAG_H + +// setrihs_mxn_diag + +#define bli_csetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \ +{ \ + const float a_r = bli_zreal( *a ); \ + const float a_i = bli_zimag( *a ); \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle ro, io, and rpi separately. 
*/ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scopys( (a_r), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scopys( (a_i), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_sadd3s( (a_r), \ + (a_i), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ +} + +#define bli_zsetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \ +{ \ + const double a_r = bli_zreal( *a ); \ + const double a_i = bli_zimag( *a ); \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle ro, io, and rpi separately. */ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dcopys( (a_r), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dcopys( (a_i), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dadd3s( (a_r), \ + (a_i), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/ro/bli_scal2jros.h b/frame/include/level0/ro/bli_scal2jros.h new file mode 100644 index 000000000..40cc87044 --- /dev/null +++ b/frame/include/level0/ro/bli_scal2jros.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2JROS_H +#define BLIS_SCAL2JROS_H + +// scal2jros + +#define bli_cscal2jros( a, x, yr ) \ +{ \ + (yr) = bli_creal(a) * bli_creal(x) + bli_cimag(a) * bli_cimag(x); \ +} + +#define bli_zscal2jros( a, x, yr ) \ +{ \ + (yr) = bli_zreal(a) * bli_zreal(x) + bli_zimag(a) * bli_zimag(x); \ +} + +#endif + diff --git a/frame/include/level0/ro/bli_scal2ros.h b/frame/include/level0/ro/bli_scal2ros.h new file mode 100644 index 000000000..95b48c198 --- /dev/null +++ b/frame/include/level0/ro/bli_scal2ros.h @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL2ROS_H +#define BLIS_SCAL2ROS_H + +// scal2ros + +#define bli_cscal2ros( a, x, yr ) \ +{ \ + (yr) = bli_creal(a) * bli_creal(x) - bli_cimag(a) * bli_cimag(x); \ +} + +#define bli_zscal2ros( a, x, yr ) \ +{ \ + (yr) = bli_zreal(a) * bli_zreal(x) - bli_zimag(a) * bli_zimag(x); \ +} + +#define bli_scscal2ros( a, x, yr ) \ +{ \ + (yr) = bli_creal(a) * bli_creal(x); \ +} + +#define bli_dzscal2ros( a, x, yr ) \ +{ \ + (yr) = bli_zreal(a) * bli_zreal(x); \ +} + + +#endif + diff --git a/frame/include/level0/rpi/bli_scal2jrpis.h b/frame/include/level0/rpi/bli_scal2jrpis.h new file mode 100644 index 000000000..bf930ad3f --- /dev/null +++ b/frame/include/level0/rpi/bli_scal2jrpis.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2JRPIS_H +#define BLIS_SCAL2JRPIS_H + +// scal2jrpis + +#define bli_cscal2jrpis( a, x, yrpi ) \ +{ \ + (yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \ + (bli_cimag(a)-bli_creal(a)) * bli_cimag(x); \ +} + +#define bli_zscal2jrpis( a, x, yrpi ) \ +{ \ + (yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \ + (bli_zimag(a)-bli_zreal(a)) * bli_zimag(x); \ +} + +#endif + diff --git a/frame/include/level0/rpi/bli_scal2rpis.h b/frame/include/level0/rpi/bli_scal2rpis.h new file mode 100644 index 000000000..6c4ee8857 --- /dev/null +++ b/frame/include/level0/rpi/bli_scal2rpis.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2RPIS_H +#define BLIS_SCAL2RPIS_H + +// scal2rpis + +#define bli_cscal2rpis( a, x, yrpi ) \ +{ \ + (yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \ + (bli_creal(a)-bli_cimag(a)) * bli_cimag(x); \ +} + +#define bli_zscal2rpis( a, x, yrpi ) \ +{ \ + (yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \ + (bli_zreal(a)-bli_zimag(a)) * bli_zimag(x); \ +} + +#define bli_scscal2rpis( a, x, yrpi ) \ +{ \ + (yrpi) = bli_creal(a) * bli_creal(x) + \ + bli_creal(a) * bli_cimag(x); \ +} + +#define bli_dzscal2rpis( a, x, yrpi ) \ +{ \ + (yrpi) = bli_zreal(a) * bli_zreal(x) + \ + bli_zreal(a) * bli_zimag(x); \ +} + + +#endif + diff --git a/testsuite/input.general b/testsuite/input.general index 0f52b46c7..df6b80442 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -23,6 +23,11 @@ sdcz # Datatype(s) to test: 100 # Problem size: first to test 300 # Problem size: maximum to test 100 # Problem size: increment between experiments + # Complex level-3 
implementations +0 # 3mh ('1' = enable; '0' = disable) +0 # 3m ('1' = enable; '0' = disable) +0 # 4mh ('1' = enable; '0' = disable) +1 # 4m ('1' = enable; '0' = disable) 1 # Error-checking level: # '0' = disable error checking; '1' = full error checking i # Reaction to test failure: diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 0fadba172..7e749582a 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -51,6 +51,12 @@ char libblis_test_store_chars[ NUM_OPERAND_TYPES ][ MAX_STORE_VALS_PER_TYPE + 1 char libblis_test_param_chars[ NUM_PARAM_TYPES ][ MAX_PARAM_VALS_PER_TYPE + 1 ]; +//#define _3MH +//#define _4MH +//#define _3M +//#define _4M + + int main( int argc, char** argv ) { test_params_t params; @@ -59,6 +65,36 @@ int main( int argc, char** argv ) // Initialize libblis. bli_init(); + // Experimental. Set the complex implementations. +/* +#if defined _3MH + bli_3mh_enable(); + bli_3m_enable(); + bli_4mh_disable(); + bli_4m_enable(); +#elif defined _3M + bli_3mh_disable(); + bli_3m_enable(); + bli_4mh_enable(); + bli_4m_enable(); +#elif defined _4MH + bli_3mh_disable(); + bli_3m_disable(); + bli_4mh_enable(); + bli_4m_enable(); +#elif defined _4M + bli_3mh_disable(); + bli_3m_disable(); + bli_4mh_disable(); + bli_4m_enable(); +#else + bli_3mh_disable(); + bli_3m_disable(); + bli_4mh_disable(); + bli_4m_enable(); +#endif +*/ + // Initialize some strings. libblis_test_init_strings(); @@ -377,6 +413,22 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->p_inc) ); + // Read whether to enable 3mh. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->enable_3mh) ); + + // Read whether to enable 3m. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->enable_3m) ); + + // Read whether to enable 4mh. 
+ libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->enable_4mh) ); + + // Read whether to enable 4m. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->enable_4m) ); + // Read the requested error-checking level. libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->error_checking_level) ); @@ -404,6 +456,16 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params // Close the file. fclose( input_stream ); + // Enable/disable the alternative complex implementations. + if ( params->enable_3mh ) bli_3mh_enable(); + else bli_3mh_disable(); + if ( params->enable_3m ) bli_3m_enable(); + else bli_3m_disable(); + if ( params->enable_4mh ) bli_4mh_enable(); + else bli_4mh_disable(); + if ( params->enable_4m ) bli_4m_enable(); + else bli_4m_disable(); + // Output the parameter struct. libblis_test_output_params_struct( stdout, params ); } @@ -595,108 +657,185 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS kernel header ---\n" ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "floating-point types s d c z \n" ); - libblis_test_fprintf_c( os, " sizes (bytes) %5u %5u %5u %5u\n", sizeof(float), + libblis_test_fprintf_c( os, "floating-point types s d c z \n" ); + libblis_test_fprintf_c( os, " sizes (bytes) %7u %7u %7u %7u\n", sizeof(float), sizeof(double), sizeof(scomplex), sizeof(dcomplex) ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "complex via 4m\n" ); - libblis_test_fprintf_c( os, " enabled for scomplex? %d\n", ( int )bli_info_get_enable_scomplex_via_4m() ); - libblis_test_fprintf_c( os, " enabled for dcomplex? 
%d\n", ( int )bli_info_get_enable_dcomplex_via_4m() ); - libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-3 def cache blkszes s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-3 def cache blkszes s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_mc_s(), ( int )bli_info_get_default_mc_d(), ( int )bli_info_get_default_mc_c(), ( int )bli_info_get_default_mc_z() ); - libblis_test_fprintf_c( os, " k dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " k dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_kc_s(), ( int )bli_info_get_default_kc_d(), ( int )bli_info_get_default_kc_c(), ( int )bli_info_get_default_kc_z() ); - libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_nc_s(), ( int )bli_info_get_default_nc_d(), ( int )bli_info_get_default_nc_c(), ( int )bli_info_get_default_nc_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-3 max cache blkszes s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-3 max cache blkszes s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_maximum_mc_s(), ( int )bli_info_get_maximum_mc_d(), ( int )bli_info_get_maximum_mc_c(), ( int )bli_info_get_maximum_mc_z() ); - libblis_test_fprintf_c( os, " k dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " k dimension %7d %7d %7d %7d\n", ( int )bli_info_get_maximum_kc_s(), ( int )bli_info_get_maximum_kc_d(), ( int )bli_info_get_maximum_kc_c(), ( int )bli_info_get_maximum_kc_z() ); - libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_maximum_nc_s(), ( int )bli_info_get_maximum_nc_d(), ( int 
)bli_info_get_maximum_nc_c(), ( int )bli_info_get_maximum_nc_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-3 register blocksizes s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-3 register blocksizes s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_mr_s(), ( int )bli_info_get_default_mr_d(), ( int )bli_info_get_default_mr_c(), ( int )bli_info_get_default_mr_z() ); - libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_nr_s(), ( int )bli_info_get_default_nr_d(), ( int )bli_info_get_default_nr_c(), ( int )bli_info_get_default_nr_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-3 pack register blksz s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-3 pack register blksz s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_packdim_mr_s(), ( int )bli_info_get_packdim_mr_d(), ( int )bli_info_get_packdim_mr_c(), ( int )bli_info_get_packdim_mr_z() ); - libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_packdim_nr_s(), ( int )bli_info_get_packdim_nr_d(), ( int )bli_info_get_packdim_nr_c(), ( int )bli_info_get_packdim_nr_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_l2_mc_s(), ( int )bli_info_get_default_l2_mc_d(), ( int )bli_info_get_default_l2_mc_c(), ( int )bli_info_get_default_l2_mc_z() ); - 
libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_l2_nc_s(), ( int )bli_info_get_default_l2_nc_d(), ( int )bli_info_get_default_l2_nc_c(), ( int )bli_info_get_default_l2_nc_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-1f fusing factors s d c z \n" ); - libblis_test_fprintf_c( os, " default %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-1f fusing factors s d c z \n" ); + libblis_test_fprintf_c( os, " default %7d %7d %7d %7d\n", ( int )bli_info_get_default_l1f_fuse_fac_s(), ( int )bli_info_get_default_l1f_fuse_fac_d(), ( int )bli_info_get_default_l1f_fuse_fac_c(), ( int )bli_info_get_default_l1f_fuse_fac_z() ); - libblis_test_fprintf_c( os, " axpyf %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " axpyf %7d %7d %7d %7d\n", ( int )bli_info_get_axpyf_fuse_fac_s(), ( int )bli_info_get_axpyf_fuse_fac_d(), ( int )bli_info_get_axpyf_fuse_fac_c(), ( int )bli_info_get_axpyf_fuse_fac_z() ); - libblis_test_fprintf_c( os, " dotxf %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " dotxf %7d %7d %7d %7d\n", ( int )bli_info_get_dotxf_fuse_fac_s(), ( int )bli_info_get_dotxf_fuse_fac_d(), ( int )bli_info_get_dotxf_fuse_fac_c(), ( int )bli_info_get_dotxf_fuse_fac_z() ); - libblis_test_fprintf_c( os, " dotxaxpyf %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " dotxaxpyf %7d %7d %7d %7d\n", ( int )bli_info_get_dotxaxpyf_fuse_fac_s(), ( int )bli_info_get_dotxaxpyf_fuse_fac_d(), ( int )bli_info_get_dotxaxpyf_fuse_fac_c(), ( int )bli_info_get_dotxaxpyf_fuse_fac_z() ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "micro-kernel types s d c z\n" ); + libblis_test_fprintf_c( os, " gemm %7s %7s %7s %7s\n", + bli_info_get_gemm_ukr_type( BLIS_FLOAT ), + bli_info_get_gemm_ukr_type( BLIS_DOUBLE ), + bli_info_get_gemm_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_gemm_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " 
gemmtrsm_l %7s %7s %7s %7s\n", + bli_info_get_gemmtrsm_l_ukr_type( BLIS_FLOAT ), + bli_info_get_gemmtrsm_l_ukr_type( BLIS_DOUBLE ), + bli_info_get_gemmtrsm_l_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_gemmtrsm_l_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " gemmtrsm_u %7s %7s %7s %7s\n", + bli_info_get_gemmtrsm_u_ukr_type( BLIS_FLOAT ), + bli_info_get_gemmtrsm_u_ukr_type( BLIS_DOUBLE ), + bli_info_get_gemmtrsm_u_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_gemmtrsm_u_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trsm_l %7s %7s %7s %7s\n", + bli_info_get_trsm_l_ukr_type( BLIS_FLOAT ), + bli_info_get_trsm_l_ukr_type( BLIS_DOUBLE ), + bli_info_get_trsm_l_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_trsm_l_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trsm_u %7s %7s %7s %7s\n", + bli_info_get_trsm_u_ukr_type( BLIS_FLOAT ), + bli_info_get_trsm_u_ukr_type( BLIS_DOUBLE ), + bli_info_get_trsm_u_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_trsm_u_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "--- BLIS implementation details ---\n" ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "level-3 implementations s d c z\n" ); + libblis_test_fprintf_c( os, " gemm %7s %7s %7s %7s\n", + bli_info_get_gemm_impl_string( BLIS_FLOAT ), + bli_info_get_gemm_impl_string( BLIS_DOUBLE ), + bli_info_get_gemm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_gemm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " hemm %7s %7s %7s %7s\n", + bli_info_get_hemm_impl_string( BLIS_FLOAT ), + bli_info_get_hemm_impl_string( BLIS_DOUBLE ), + bli_info_get_hemm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_hemm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " herk %7s %7s %7s %7s\n", + bli_info_get_herk_impl_string( BLIS_FLOAT ), + bli_info_get_herk_impl_string( BLIS_DOUBLE ), + bli_info_get_herk_impl_string( BLIS_SCOMPLEX ), + bli_info_get_herk_impl_string( BLIS_DCOMPLEX ) ); + 
libblis_test_fprintf_c( os, " her2k %7s %7s %7s %7s\n", + bli_info_get_her2k_impl_string( BLIS_FLOAT ), + bli_info_get_her2k_impl_string( BLIS_DOUBLE ), + bli_info_get_her2k_impl_string( BLIS_SCOMPLEX ), + bli_info_get_her2k_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " symm %7s %7s %7s %7s\n", + bli_info_get_symm_impl_string( BLIS_FLOAT ), + bli_info_get_symm_impl_string( BLIS_DOUBLE ), + bli_info_get_symm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_symm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " syrk %7s %7s %7s %7s\n", + bli_info_get_syrk_impl_string( BLIS_FLOAT ), + bli_info_get_syrk_impl_string( BLIS_DOUBLE ), + bli_info_get_syrk_impl_string( BLIS_SCOMPLEX ), + bli_info_get_syrk_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " syr2k %7s %7s %7s %7s\n", + bli_info_get_syr2k_impl_string( BLIS_FLOAT ), + bli_info_get_syr2k_impl_string( BLIS_DOUBLE ), + bli_info_get_syr2k_impl_string( BLIS_SCOMPLEX ), + bli_info_get_syr2k_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trmm %7s %7s %7s %7s\n", + bli_info_get_trmm_impl_string( BLIS_FLOAT ), + bli_info_get_trmm_impl_string( BLIS_DOUBLE ), + bli_info_get_trmm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_trmm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trmm3 %7s %7s %7s %7s\n", + bli_info_get_trmm3_impl_string( BLIS_FLOAT ), + bli_info_get_trmm3_impl_string( BLIS_DOUBLE ), + bli_info_get_trmm3_impl_string( BLIS_SCOMPLEX ), + bli_info_get_trmm3_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trsm %7s %7s %7s %7s\n", + bli_info_get_trsm_impl_string( BLIS_FLOAT ), + bli_info_get_trsm_impl_string( BLIS_DOUBLE ), + bli_info_get_trsm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_trsm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf( os, "\n" ); // Output the contents of the param struct. 
@@ -719,6 +858,10 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "problem size: first to test %u\n", params->p_first ); libblis_test_fprintf_c( os, "problem size: max to test %u\n", params->p_max ); libblis_test_fprintf_c( os, "problem size increment %u\n", params->p_inc ); + libblis_test_fprintf_c( os, "enable 3mh? %u\n", params->enable_3mh ); + libblis_test_fprintf_c( os, "enable 3m? %u\n", params->enable_3m ); + libblis_test_fprintf_c( os, "enable 4mh? %u\n", params->enable_4mh ); + libblis_test_fprintf_c( os, "enable 4m? %u\n", params->enable_4m ); libblis_test_fprintf_c( os, "error-checking level %u\n", params->error_checking_level ); libblis_test_fprintf_c( os, "reaction to failure %c\n", params->reaction_to_failure ); libblis_test_fprintf_c( os, "output in matlab format? %u\n", params->output_matlab_format ); diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 1609b0c66..31dbf429d 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -160,6 +160,10 @@ typedef struct unsigned int p_first; unsigned int p_max; unsigned int p_inc; + unsigned int enable_3mh; + unsigned int enable_3m; + unsigned int enable_4mh; + unsigned int enable_4m; char reaction_to_failure; unsigned int output_matlab_format; unsigned int output_files;