From 4cc2b464f29cafbfef9295b073b857fe0752f710 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 15 Aug 2014 11:49:15 -0500 Subject: [PATCH] Reorganized packm ukernels. Details: - Previously, packm micro-kernels were organized by the implied register blocksize (panel dimension) assumed by the kernel, meaning conventional, ri, and ri3 variations of some micro-kernel size were housed in the same file. This commit reorganizes the micro-kernels so that all sizes reside in the same file for each format type (conventional, ri, and ri3). --- frame/1m/packm/bli_packm.h | 9 - frame/1m/packm/bli_packm_cxk.h | 3 + frame/1m/packm/bli_packm_cxk_ri.h | 3 + frame/1m/packm/bli_packm_cxk_ri3.h | 3 + frame/1m/packm/ukernels/bli_packm_ref_10xk.c | 378 ------- frame/1m/packm/ukernels/bli_packm_ref_10xk.h | 63 -- frame/1m/packm/ukernels/bli_packm_ref_12xk.c | 402 ------- frame/1m/packm/ukernels/bli_packm_ref_12xk.h | 63 -- frame/1m/packm/ukernels/bli_packm_ref_14xk.c | 426 -------- frame/1m/packm/ukernels/bli_packm_ref_14xk.h | 63 -- frame/1m/packm/ukernels/bli_packm_ref_16xk.c | 450 -------- frame/1m/packm/ukernels/bli_packm_ref_16xk.h | 63 -- frame/1m/packm/ukernels/bli_packm_ref_2xk.c | 282 ----- frame/1m/packm/ukernels/bli_packm_ref_4xk.c | 306 ------ frame/1m/packm/ukernels/bli_packm_ref_6xk.c | 330 ------ frame/1m/packm/ukernels/bli_packm_ref_8xk.c | 354 ------- frame/1m/packm/ukernels/bli_packm_ref_8xk.h | 63 -- frame/1m/packm/ukernels/bli_packm_ref_cxk.c | 826 +++++++++++++++ ...li_packm_ref_2xk.h => bli_packm_ref_cxk.h} | 23 +- .../1m/packm/ukernels/bli_packm_ref_cxk_ri.c | 946 +++++++++++++++++ ...packm_ref_4xk.h => bli_packm_ref_cxk_ri.h} | 25 +- .../1m/packm/ukernels/bli_packm_ref_cxk_ri3.c | 986 ++++++++++++++++++ ...ackm_ref_6xk.h => bli_packm_ref_cxk_ri3.h} | 25 +- 23 files changed, 2790 insertions(+), 3302 deletions(-) delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_10xk.c delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_10xk.h delete mode 
100644 frame/1m/packm/ukernels/bli_packm_ref_12xk.c delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_12xk.h delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_14xk.c delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_14xk.h delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_16xk.c delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_16xk.h delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_2xk.c delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_4xk.c delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_6xk.c delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_8xk.c delete mode 100644 frame/1m/packm/ukernels/bli_packm_ref_8xk.h create mode 100644 frame/1m/packm/ukernels/bli_packm_ref_cxk.c rename frame/1m/packm/ukernels/{bli_packm_ref_2xk.h => bli_packm_ref_cxk.h} (81%) create mode 100644 frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.c rename frame/1m/packm/ukernels/{bli_packm_ref_4xk.h => bli_packm_ref_cxk_ri.h} (79%) create mode 100644 frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.c rename frame/1m/packm/ukernels/{bli_packm_ref_6xk.h => bli_packm_ref_cxk_ri3.h} (79%) diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 3f68a5bb9..beda85df0 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -53,12 +53,3 @@ #include "bli_packm_cxk_ri.h" #include "bli_packm_cxk_ri3.h" -#include "bli_packm_ref_2xk.h" -#include "bli_packm_ref_4xk.h" -#include "bli_packm_ref_6xk.h" -#include "bli_packm_ref_8xk.h" -#include "bli_packm_ref_10xk.h" -#include "bli_packm_ref_12xk.h" -#include "bli_packm_ref_14xk.h" -#include "bli_packm_ref_16xk.h" - diff --git a/frame/1m/packm/bli_packm_cxk.h b/frame/1m/packm/bli_packm_cxk.h index 38eb6c1bc..921903eba 100644 --- a/frame/1m/packm/bli_packm_cxk.h +++ b/frame/1m/packm/bli_packm_cxk.h @@ -32,6 +32,9 @@ */ +#include "bli_packm_ref_cxk.h" + + #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ diff --git a/frame/1m/packm/bli_packm_cxk_ri.h 
b/frame/1m/packm/bli_packm_cxk_ri.h index 200b2f532..ef653d2ec 100644 --- a/frame/1m/packm/bli_packm_cxk_ri.h +++ b/frame/1m/packm/bli_packm_cxk_ri.h @@ -32,6 +32,9 @@ */ +#include "bli_packm_ref_cxk_ri.h" + + #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ diff --git a/frame/1m/packm/bli_packm_cxk_ri3.h b/frame/1m/packm/bli_packm_cxk_ri3.h index d45508453..0c3d95a10 100644 --- a/frame/1m/packm/bli_packm_cxk_ri3.h +++ b/frame/1m/packm/bli_packm_cxk_ri3.h @@ -32,6 +32,9 @@ */ +#include "bli_packm_ref_cxk_ri3.h" + + #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ diff --git a/frame/1m/packm/ukernels/bli_packm_ref_10xk.c b/frame/1m/packm/ukernels/bli_packm_ref_10xk.c deleted file mode 100644 index f284529e9..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_10xk.c +++ /dev/null @@ -1,378 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ) \ -{ \ - ctype* restrict beta_cast = beta; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 
5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ -} - 
-INSERT_GENTFUNC_BASIC0( packm_ref_10xk ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyris)( 
*(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 
6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ 
-void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; 
\ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), 
*(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( 
*beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_10xk.h b/frame/1m/packm/ukernels/bli_packm_ref_10xk.h deleted file mode 100644 index e8c9c7019..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_10xk.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ); - -INSERT_GENTPROT_BASIC( packm_ref_10xk ) - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ); - -INSERT_GENTPROTCO_BASIC( packm_ref_10xk_ri ) - -INSERT_GENTPROTCO_BASIC( packm_ref_10xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_12xk.c b/frame/1m/packm/ukernels/bli_packm_ref_12xk.c deleted file mode 100644 index 262beef99..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_12xk.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ) \ -{ \ - ctype* restrict beta_cast = beta; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - 
} \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( 
packm_ref_12xk ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), 
*(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 
3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), 
*(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r 
+ 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 
6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), 
*(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), 
*(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_12xk.h b/frame/1m/packm/ukernels/bli_packm_ref_12xk.h deleted file mode 100644 index eb187dae4..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_12xk.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ); - -INSERT_GENTPROT_BASIC( packm_ref_12xk ) - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ); - -INSERT_GENTPROTCO_BASIC( packm_ref_12xk_ri ) - -INSERT_GENTPROTCO_BASIC( packm_ref_12xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_14xk.c b/frame/1m/packm/ukernels/bli_packm_ref_14xk.c deleted file mode 100644 index 4040f0ff6..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_14xk.c +++ /dev/null @@ -1,426 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ) \ -{ \ - ctype* restrict beta_cast = beta; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 
+10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), 
*(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_ref_14xk ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), 
*(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), 
*(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2jris)( 
*beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), 
*(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - 
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 
9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - 
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r 
+ 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_14xk.h b/frame/1m/packm/ukernels/bli_packm_ref_14xk.h deleted file mode 100644 index f33849340..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_14xk.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ); - -INSERT_GENTPROT_BASIC( packm_ref_14xk ) - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ); - -INSERT_GENTPROTCO_BASIC( packm_ref_14xk_ri ) - -INSERT_GENTPROTCO_BASIC( packm_ref_14xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_16xk.c b/frame/1m/packm/ukernels/bli_packm_ref_16xk.c deleted file mode 100644 index 8642a75d1..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_16xk.c +++ /dev/null @@ -1,450 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ) \ -{ \ - ctype* restrict beta_cast = beta; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 
8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, 
*(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_ref_16xk ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), 
*(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r 
+ 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i 
+ 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), 
*(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( 
ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - 
PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i 
+12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - 
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, 
*(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_16xk.h b/frame/1m/packm/ukernels/bli_packm_ref_16xk.h deleted file mode 100644 index 55c19cbab..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_16xk.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ); - -INSERT_GENTPROT_BASIC( packm_ref_16xk ) - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ); - -INSERT_GENTPROTCO_BASIC( packm_ref_16xk_ri ) - -INSERT_GENTPROTCO_BASIC( packm_ref_16xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_2xk.c b/frame/1m/packm/ukernels/bli_packm_ref_2xk.c deleted file mode 100644 index 7c48e302d..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_2xk.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ) \ -{ \ - ctype* restrict beta_cast = beta; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_ref_2xk ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, 
ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, 
*(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 
0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_4xk.c b/frame/1m/packm/ukernels/bli_packm_ref_4xk.c deleted file mode 100644 index 7c37e8dfe..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_4xk.c +++ /dev/null @@ -1,306 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ) \ -{ \ - ctype* restrict beta_cast = beta; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - 
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_ref_4xk ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ 
-\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri 
) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i 
+= ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_6xk.c b/frame/1m/packm/ukernels/bli_packm_ref_6xk.c deleted file mode 100644 index ee03e5216..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_6xk.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ) \ -{ \ - ctype* restrict beta_cast = beta; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - 
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_ref_6xk ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyris)( 
*(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), 
*(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), 
*(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ -\ - alpha1_r += lda2; \ - 
alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_8xk.c b/frame/1m/packm/ukernels/bli_packm_ref_8xk.c deleted file mode 100644 index ac30c87da..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_8xk.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ) \ -{ \ - ctype* restrict beta_cast = beta; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - 
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_ref_8xk ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), 
*(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, 
*beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri ) - - - -#undef GENTFUNCCO -#define 
GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ) \ -{ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ -\ - ctype* beta_cast = beta; \ - ctype_r* restrict beta_r = ( ctype_r* )beta; \ - ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ -\ - if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), 
*(pi1_ri + 0) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), 
*(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ - PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp; \ - pi1_i += ldp; \ - pi1_ri += ldp; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_8xk.h b/frame/1m/packm/ukernels/bli_packm_ref_8xk.h deleted file mode 100644 index 0dd438fc3..000000000 --- a/frame/1m/packm/ukernels/bli_packm_ref_8xk.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ); - -INSERT_GENTPROT_BASIC( packm_ref_8xk ) - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ); - -INSERT_GENTPROTCO_BASIC( packm_ref_8xk_ri ) - -INSERT_GENTPROTCO_BASIC( packm_ref_8xk_ri3 ) - diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk.c b/frame/1m/packm/ukernels/bli_packm_ref_cxk.c new file mode 100644 index 000000000..54a056ca0 --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk.c @@ -0,0 +1,826 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( packm_ref_2xk ) + + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, 
varname ) \ +/* 4xk pack kernel: each of the n iterations reads four elements of a at stride inca and writes them to p[0..3], then advances a by lda and p by ldp. The copy macros are used when the eq1 test on beta holds; otherwise the scal2 macros, which additionally receive beta, are used. The "j" macro variants are selected when bli_is_conj( conja ) holds. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( packm_ref_4xk ) + + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch,
varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + 
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( packm_ref_6xk ) + + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + 
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( packm_ref_8xk ) + + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), 
*(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ 
+ { \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( packm_ref_10xk ) + +/* packm_ref_12xk template: every one of the n loop iterations reads the 12 elements alpha1[k * inca], k = 0..11, writes them to pi1[0..11], then steps the a cursor by lda and the p cursor by ldp. */ +/* A true eq1 test on *beta selects the copy macros (beta unused); otherwise the scal2 macros receive *beta. bli_is_conj( conja ) selects the j-suffixed variants. These macros are defined outside this file; presumably eq1 means beta == 1 and j means conjugate (TODO confirm). NOTE(review): the loops run until n reaches 0, so n is assumed non-negative. */ +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; /* read cursor into a */ \ + ctype* restrict pi1 = p; /* write cursor into p */ \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* copy-only path: beta is not passed to the copy macros */ \ + { \ + if ( bli_is_conj( conja ) ) /* j variant of the copy */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \
+ } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ 
+ PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( packm_ref_12xk ) + +/* packm_ref_14xk template: every one of the n loop iterations reads the 14 elements alpha1[k * inca], k = 0..13, writes them to pi1[0..13], then steps the a cursor by lda and the p cursor by ldp. */ +/* A true eq1 test on *beta selects the copy macros (beta unused); otherwise the scal2 macros receive *beta. bli_is_conj( conja ) selects the j-suffixed variants. These macros are defined outside this file; presumably eq1 means beta == 1 and j means conjugate (TODO confirm). NOTE(review): the loops run until n reaches 0, so n is assumed non-negative. */ +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; /* read cursor into a */ \ + ctype* restrict pi1 = p; /* write cursor into p */ \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* copy-only path: beta is not passed to the copy macros */ \ + { \ + if ( bli_is_conj( conja ) ) /* j variant of the copy */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +12*inca),
*(pi1 +12) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ + 
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( packm_ref_14xk ) + +/* packm_ref_16xk template: every one of the n loop iterations reads the 16 elements alpha1[k * inca], k = 0..15, writes them to pi1[0..15], then steps the a cursor by lda and the p cursor by ldp. */ +/* A true eq1 test on *beta selects the copy macros (beta unused); otherwise the scal2 macros receive *beta. bli_is_conj( conja ) selects the j-suffixed variants. These macros are defined outside this file; presumably eq1 means beta == 1 and j means conjugate (TODO confirm). NOTE(review): the loops run until n reaches 0, so n is assumed non-negative. */ +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; /* read cursor into a */ \ + ctype* restrict pi1 = p; /* write cursor into p */ \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* copy-only path: beta is not passed to the copy macros */ \ + { \ + if ( bli_is_conj( conja ) ) /* j variant of the copy */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 +
3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ + PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ + PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 
+ 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ + 
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( packm_ref_16xk ) + diff --git a/frame/1m/packm/ukernels/bli_packm_ref_2xk.h b/frame/1m/packm/ukernels/bli_packm_ref_cxk.h similarity index 81% rename from frame/1m/packm/ukernels/bli_packm_ref_2xk.h rename to frame/1m/packm/ukernels/bli_packm_ref_cxk.h index 8ce60c9e3..c2d78a7d1 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_2xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk.h @@ -44,20 +44,11 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( packm_ref_2xk ) - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t psp, inc_t ldp \ - ); - -INSERT_GENTPROTCO_BASIC( packm_ref_2xk_ri ) - -INSERT_GENTPROTCO_BASIC( packm_ref_2xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_4xk ) +INSERT_GENTPROT_BASIC( packm_ref_6xk ) +INSERT_GENTPROT_BASIC( packm_ref_8xk ) +INSERT_GENTPROT_BASIC( packm_ref_10xk ) +INSERT_GENTPROT_BASIC( packm_ref_12xk ) +INSERT_GENTPROT_BASIC( packm_ref_14xk ) +INSERT_GENTPROT_BASIC( packm_ref_16xk ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.c b/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.c new file mode 100644 index 000000000..bd88c0941 --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.c @@ -0,0 +1,946 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +/* packm_ref_2xk_ri template: a is walked through ctype_r pointers, with the _r cursor at a and the _i cursor one ctype_r later, so inca and lda are doubled. Each of the n iterations handles the 2 element pairs k = 0..1, writing to pi1_r[k] and pi1_i[k], where pi1_i starts psp ctype_r slots after p; it then advances the a cursors by lda2 and the p cursors by ldp. The _r and _i suffixes presumably denote real and imaginary parts, and eq1 presumably means beta == 1 (TODO confirm against the macro definitions, which are outside this file). NOTE(review): the loops run until n reaches 0, so n is assumed non-negative. */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; /* strides doubled because a is indexed via ctype_r pointers */ \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; /* second output region begins psp slots after the first */ \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* copy-only path: beta is not passed to the copy macros */ \ + { \ + if ( bli_is_conj( conja ) ) /* j variant of the copy */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else /* scaling path: beta_r and beta_i are passed to the scal2 macros */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2),
*(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri ) + +/* packm_ref_4xk_ri template: a is walked through ctype_r pointers, with the _r cursor at a and the _i cursor one ctype_r later, so inca and lda are doubled. Each of the n iterations handles the 4 element pairs k = 0..3, writing to pi1_r[k] and pi1_i[k], where pi1_i starts psp ctype_r slots after p; it then advances the a cursors by lda2 and the p cursors by ldp. */ +/* The _r and _i suffixes presumably denote real and imaginary parts, and eq1 presumably means beta == 1 (TODO confirm against the macro definitions, which are outside this file). bli_is_conj( conja ) selects the j-named macro variants. NOTE(review): the loops run until n reaches 0, so n is assumed non-negative. */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; /* strides doubled because a is indexed via ctype_r pointers */ \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; /* second output region begins psp slots after the first */ \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* copy-only path: beta is not passed to the copy macros */ \ + { \ + if ( bli_is_conj( conja ) ) /* j variant of the copy */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2),
*(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri ) + +/* packm_ref_6xk_ri template: a is walked through ctype_r pointers, with the _r cursor at a and the _i cursor one ctype_r later, so inca and lda are doubled. Each of the n iterations handles the 6 element pairs k = 0..5, writing to pi1_r[k] and pi1_i[k], where pi1_i starts psp ctype_r slots after p; it then advances the a cursors by lda2 and the p cursors by ldp. */ +/* The _r and _i suffixes presumably denote real and imaginary parts, and eq1 presumably means beta == 1 (TODO confirm against the macro definitions, which are outside this file). A true eq1 test on *beta selects the copy macros, otherwise the scal2 macros receive beta_r and beta_i; bli_is_conj( conja ) selects the j-named variants. NOTE(review): the loops run until n reaches 0, so n is assumed non-negative. */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; /* strides doubled because a is indexed via ctype_r pointers */ \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict
alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), 
*(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri ) + +/* packm_ref_8xk_ri template: a is walked through ctype_r pointers, with the _r cursor at a and the _i cursor one ctype_r later, so inca and lda are doubled. Each of the n iterations handles the 8 element pairs k = 0..7, writing to pi1_r[k] and pi1_i[k], where pi1_i starts psp ctype_r slots after p; it then advances the a cursors by lda2 and the p cursors by ldp. */ +/* The _r and _i suffixes presumably denote real and imaginary parts, and eq1 presumably means beta == 1 (TODO confirm against the macro definitions, which are outside this file). A true eq1 test on *beta selects the copy macros, otherwise the scal2 macros receive beta_r and beta_i; bli_is_conj( conja ) selects the j-named variants. NOTE(review): the loops run until n reaches 0, so n is assumed non-negative. */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; /* strides doubled because a is indexed via ctype_r pointers */ \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta
+ 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) 
); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), 
*(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), 
*(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 
2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 
7) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + 
PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, 
*beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), 
*(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 
0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 
4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ 
+ PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, 
*beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), 
*(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 
5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, 
*(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i 
+ 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri ) + diff --git a/frame/1m/packm/ukernels/bli_packm_ref_4xk.h b/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.h similarity index 79% rename from frame/1m/packm/ukernels/bli_packm_ref_4xk.h rename to frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.h index 72980195d..920f0023f 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_4xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri.h @@ -35,20 +35,6 @@ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ 
- ); - -INSERT_GENTPROT_BASIC( packm_ref_4xk ) - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ void PASTEMAC(ch,varname)( \ conj_t conja, \ dim_t n, \ @@ -57,7 +43,12 @@ void PASTEMAC(ch,varname)( \ void* p, inc_t psp, inc_t ldp \ ); -INSERT_GENTPROTCO_BASIC( packm_ref_4xk_ri ) - -INSERT_GENTPROTCO_BASIC( packm_ref_4xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_2xk_ri ) +INSERT_GENTPROT_BASIC( packm_ref_4xk_ri ) +INSERT_GENTPROT_BASIC( packm_ref_6xk_ri ) +INSERT_GENTPROT_BASIC( packm_ref_8xk_ri ) +INSERT_GENTPROT_BASIC( packm_ref_10xk_ri ) +INSERT_GENTPROT_BASIC( packm_ref_12xk_ri ) +INSERT_GENTPROT_BASIC( packm_ref_14xk_ri ) +INSERT_GENTPROT_BASIC( packm_ref_16xk_ri ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.c b/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.c new file mode 100644 index 000000000..11253d910 --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.c @@ -0,0 +1,986 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 
0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri3 ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* 
restrict pi1_ri = ( ctype_r* )p + 2*psp; /* third p region, 2*psp ctype_r elements past p */ \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* this branch uses the copy macros, which take no beta */ \ + { \ + if ( bli_is_conj( conja ) ) /* selects the j-suffixed macro */ \ + { \ + for ( ; n != 0; --n ) /* four macro invocations per iteration; NOTE(review): assumes n >= 0 */ \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ +\ + alpha1_r += lda2; /* a pointers advance by lda2, p pointers by ldp */ \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else /* beta is passed to the scal2 macros in this branch */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3),
*(pi1_i + 3), *(pi1_ri + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + /* INSERT_GENTFUNCCO_BASIC0 is defined elsewhere; presumably it expands the template above once per complex type */ +INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri3 ) + + /* Next template: same structure with six macro invocations per iteration. */ + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; /* a is walked through ctype_r pointers, so both of its strides are doubled */ \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; /* three p regions, psp ctype_r elements apart */ \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* this branch uses the copy macros, which take no beta */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)(
*(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ +\ + alpha1_r += lda2; /* a pointers advance by lda2, p pointers by ldp */ \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else /* conja is not conj: macro names without the j */ \ + { \ + for ( ; n != 0; --n ) /* six macro invocations per iteration; NOTE(review): assumes n >= 0 */ \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else /* beta is passed to the scal2 macros in this branch */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +
3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + /* INSERT_GENTFUNCCO_BASIC0 is defined elsewhere; presumably it expands the template above once per complex type */ +INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri3 ) + + /* Next template: same structure with eight macro invocations per iteration. */ + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; /* a is walked through ctype_r pointers, so both of its strides are doubled */ \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r*
)a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; /* three p regions, psp ctype_r elements apart */ \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* this branch uses the copy macros, which take no beta */ \ + { \ + if ( bli_is_conj( conja ) ) /* selects the j-suffixed macro */ \ + { \ + for ( ; n != 0; --n ) /* eight macro invocations per iteration; NOTE(review): assumes n >= 0 */ \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ +\ + alpha1_r += lda2; /* a pointers advance by lda2, p pointers by ldp */ \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4),
*(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else /* beta is passed to the scal2 macros in this branch */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ +
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + /* INSERT_GENTFUNCCO_BASIC0 is defined elsewhere; presumably it expands the template above once per complex type */ +INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri3 ) + + /* Next template: same structure with ten macro invocations per iteration. */ + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; /* a is walked through ctype_r pointers, so both of its strides are doubled */ \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; /* three p regions, psp ctype_r elements apart */ \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* this branch uses the copy macros, which take no beta */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2),
*(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ +\ + alpha1_r += lda2; /* a pointers advance by lda2, p pointers by ldp */ \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else /* conja is not conj: macro names without the j */ \ + { \ + for ( ; n != 0; --n ) /* ten macro invocations per iteration; NOTE(review): assumes n >= 0 */ \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri
+ 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else /* beta is passed to the scal2 macros in this branch */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2),
*(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + /* INSERT_GENTFUNCCO_BASIC0 is defined elsewhere; presumably it expands the template above once per complex type */ +INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri3 ) + + /* Next template: same structure with twelve macro invocations per iteration. */ + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp,
inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; /* a is walked through ctype_r pointers, so both of its strides are doubled */ \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; /* three p regions, psp ctype_r elements apart */ \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* this branch uses the copy macros, which take no beta */ \ + { \ + if ( bli_is_conj( conja ) ) /* selects the j-suffixed macro */ \ + { \ + for ( ; n != 0; --n ) /* twelve macro invocations per iteration; NOTE(review): assumes n >= 0 */ \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i
+11), *(pi1_ri +11) ); \ +\ + alpha1_r += lda2; /* a pointers advance by lda2, p pointers by ldp */ \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else /* beta is passed to the scal2 macros in this branch */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i,
*(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2),
*(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + /* INSERT_GENTFUNCCO_BASIC0 is defined elsewhere; presumably it expands the template above once per complex type */ +INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri3 ) + + /* Next template: same structure with fourteen macro invocations per iteration. */ + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; /* a is walked through ctype_r pointers, so both of its strides are doubled */ \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ +
ctype_r* restrict pi1_r = ( ctype_r* )p; /* three p regions, psp ctype_r elements apart */ \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) /* this branch uses the copy macros, which take no beta */ \ + { \ + if ( bli_is_conj( conja ) ) /* selects the j-suffixed macro */ \ + { \ + for ( ; n != 0; --n ) /* fourteen macro invocations per iteration; NOTE(review): assumes n >= 0 */ \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ +\ + alpha1_r += lda2; /* a pointers advance by lda2, p pointers by ldp */ \ + alpha1_i += lda2; \ + pi1_r += ldp; \
+ pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else /* beta is passed to the scal2 macros in this branch */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i,
*(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + }
\ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ +\ + alpha1_r += lda2; \ +
alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + /* INSERT_GENTFUNCCO_BASIC0 is defined elsewhere; presumably it expands the template above once per complex type */ +INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri3 ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8),
*(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyri3s)( 
*(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, 
*(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), 
*(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri3 ) + diff --git a/frame/1m/packm/ukernels/bli_packm_ref_6xk.h b/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.h similarity index 79% 
rename from frame/1m/packm/ukernels/bli_packm_ref_6xk.h rename to frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.h index af832df2a..2800291c8 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_6xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_ri3.h @@ -35,20 +35,6 @@ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - dim_t n, \ - void* beta, \ - void* a, inc_t inca, inc_t lda, \ - void* p, inc_t ldp \ - ); - -INSERT_GENTPROT_BASIC( packm_ref_6xk ) - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ void PASTEMAC(ch,varname)( \ conj_t conja, \ dim_t n, \ @@ -57,7 +43,12 @@ void PASTEMAC(ch,varname)( \ void* p, inc_t psp, inc_t ldp \ ); -INSERT_GENTPROTCO_BASIC( packm_ref_6xk_ri ) - -INSERT_GENTPROTCO_BASIC( packm_ref_6xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_2xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_4xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_6xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_8xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_10xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_12xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_14xk_ri3 ) +INSERT_GENTPROT_BASIC( packm_ref_16xk_ri3 )