From e2e7cb2fbe615be4d375bc2dce88d03d98fadc9e Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 13 Dec 2012 18:17:54 -0600 Subject: [PATCH] Expanded reference packm/unpackm kernel set to 16. Details: - Added 10xk, 12xk, 14xk, and 16xk reference kernels for packm and unpackm. - Updated bl2_[un]packm_cxk() to silently use scal2m if "out of range" kernel size is requested. (Thanks to Tyler for finding this bug.) - Updated bl2_kernel.h to contain new _KERNEL definitions, according to above changes, for 'reference' and 'clarksville' configurations. - Updated CHANGELOG. - Removed "output*.m" from .gitignore. --- .gitignore | 1 - CHANGELOG | 8 +- config/clarksville/bl2_kernel.h | 8 + config/reference/bl2_kernel.h | 8 + frame/1m/packm/bl2_packm_cxk.c | 111 ++++++++---- frame/1m/packm/bl2_packm_cxk.h | 4 + frame/1m/packm/ukernels/bl2_packm_ref_10xk.c | 139 +++++++++++++++ frame/1m/packm/ukernels/bl2_packm_ref_10xk.h | 46 +++++ frame/1m/packm/ukernels/bl2_packm_ref_12xk.c | 147 ++++++++++++++++ frame/1m/packm/ukernels/bl2_packm_ref_12xk.h | 46 +++++ frame/1m/packm/ukernels/bl2_packm_ref_14xk.c | 155 +++++++++++++++++ frame/1m/packm/ukernels/bl2_packm_ref_14xk.h | 46 +++++ frame/1m/packm/ukernels/bl2_packm_ref_16xk.c | 163 ++++++++++++++++++ frame/1m/packm/ukernels/bl2_packm_ref_16xk.h | 46 +++++ frame/1m/unpackm/bl2_unpackm_cxk.c | 111 ++++++++---- frame/1m/unpackm/bl2_unpackm_cxk.h | 4 + .../unpackm/ukernels/bl2_unpackm_ref_10xk.c | 139 +++++++++++++++ .../unpackm/ukernels/bl2_unpackm_ref_10xk.h | 46 +++++ .../unpackm/ukernels/bl2_unpackm_ref_12xk.c | 147 ++++++++++++++++ .../unpackm/ukernels/bl2_unpackm_ref_12xk.h | 46 +++++ .../unpackm/ukernels/bl2_unpackm_ref_14xk.c | 155 +++++++++++++++++ .../unpackm/ukernels/bl2_unpackm_ref_14xk.h | 46 +++++ .../unpackm/ukernels/bl2_unpackm_ref_16xk.c | 163 ++++++++++++++++++ .../unpackm/ukernels/bl2_unpackm_ref_16xk.h | 46 +++++ version | 2 +- 25 files changed, 1756 insertions(+), 77 deletions(-) create mode 100644 
frame/1m/packm/ukernels/bl2_packm_ref_10xk.c create mode 100644 frame/1m/packm/ukernels/bl2_packm_ref_10xk.h create mode 100644 frame/1m/packm/ukernels/bl2_packm_ref_12xk.c create mode 100644 frame/1m/packm/ukernels/bl2_packm_ref_12xk.h create mode 100644 frame/1m/packm/ukernels/bl2_packm_ref_14xk.c create mode 100644 frame/1m/packm/ukernels/bl2_packm_ref_14xk.h create mode 100644 frame/1m/packm/ukernels/bl2_packm_ref_16xk.c create mode 100644 frame/1m/packm/ukernels/bl2_packm_ref_16xk.h create mode 100644 frame/1m/unpackm/ukernels/bl2_unpackm_ref_10xk.c create mode 100644 frame/1m/unpackm/ukernels/bl2_unpackm_ref_10xk.h create mode 100644 frame/1m/unpackm/ukernels/bl2_unpackm_ref_12xk.c create mode 100644 frame/1m/unpackm/ukernels/bl2_unpackm_ref_12xk.h create mode 100644 frame/1m/unpackm/ukernels/bl2_unpackm_ref_14xk.c create mode 100644 frame/1m/unpackm/ukernels/bl2_unpackm_ref_14xk.h create mode 100644 frame/1m/unpackm/ukernels/bl2_unpackm_ref_16xk.c create mode 100644 frame/1m/unpackm/ukernels/bl2_unpackm_ref_16xk.h diff --git a/.gitignore b/.gitignore index 13899d147..bc5798dd0 100644 --- a/.gitignore +++ b/.gitignore @@ -29,5 +29,4 @@ obj # -- misc. -- -output*.m diff --git a/CHANGELOG b/CHANGELOG index 3efd37569..c832cf1f0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,10 @@ -commit 7ad4ebef38b8e6eea9b6091844ba7294ec870271 (HEAD, tag: 0.0.1, origin/master, master) +commit 17455a8bce038dd570356ab0c5c11d9a89f20248 (HEAD, origin/master, master) +Author: Field G. Van Zee +Date: Mon Dec 10 17:23:32 2012 -0600 + + Minor updates towards to 0.0.1. + +commit 7ad4ebef38b8e6eea9b6091844ba7294ec870271 (tag: 0.0.1) Author: Field G. 
Van Zee Date: Mon Dec 10 16:18:40 2012 -0600 diff --git a/config/clarksville/bl2_kernel.h b/config/clarksville/bl2_kernel.h index ef0d79f36..cd34dac7e 100644 --- a/config/clarksville/bl2_kernel.h +++ b/config/clarksville/bl2_kernel.h @@ -65,6 +65,10 @@ #define PACKM_4XK_KERNEL packm_ref_4xk #define PACKM_6XK_KERNEL packm_ref_6xk #define PACKM_8XK_KERNEL packm_ref_8xk +#define PACKM_10XK_KERNEL packm_ref_10xk +#define PACKM_12XK_KERNEL packm_ref_12xk +#define PACKM_14XK_KERNEL packm_ref_14xk +#define PACKM_16XK_KERNEL packm_ref_16xk // -- unpackm -- @@ -72,6 +76,10 @@ #define UNPACKM_4XK_KERNEL unpackm_ref_4xk #define UNPACKM_6XK_KERNEL unpackm_ref_6xk #define UNPACKM_8XK_KERNEL unpackm_ref_8xk +#define UNPACKM_10XK_KERNEL unpackm_ref_10xk +#define UNPACKM_12XK_KERNEL unpackm_ref_12xk +#define UNPACKM_14XK_KERNEL unpackm_ref_14xk +#define UNPACKM_16XK_KERNEL unpackm_ref_16xk diff --git a/config/reference/bl2_kernel.h b/config/reference/bl2_kernel.h index aee428524..61dea14be 100644 --- a/config/reference/bl2_kernel.h +++ b/config/reference/bl2_kernel.h @@ -61,6 +61,10 @@ #define PACKM_4XK_KERNEL packm_ref_4xk #define PACKM_6XK_KERNEL packm_ref_6xk #define PACKM_8XK_KERNEL packm_ref_8xk +#define PACKM_10XK_KERNEL packm_ref_10xk +#define PACKM_12XK_KERNEL packm_ref_12xk +#define PACKM_14XK_KERNEL packm_ref_14xk +#define PACKM_16XK_KERNEL packm_ref_16xk // -- unpackm -- @@ -68,6 +72,10 @@ #define UNPACKM_4XK_KERNEL unpackm_ref_4xk #define UNPACKM_6XK_KERNEL unpackm_ref_6xk #define UNPACKM_8XK_KERNEL unpackm_ref_8xk +#define UNPACKM_10XK_KERNEL unpackm_ref_10xk +#define UNPACKM_12XK_KERNEL unpackm_ref_12xk +#define UNPACKM_14XK_KERNEL unpackm_ref_14xk +#define UNPACKM_16XK_KERNEL unpackm_ref_16xk diff --git a/frame/1m/packm/bl2_packm_cxk.c b/frame/1m/packm/bl2_packm_cxk.c index 1c017a978..246449d09 100644 --- a/frame/1m/packm/bl2_packm_cxk.c +++ b/frame/1m/packm/bl2_packm_cxk.c @@ -44,25 +44,22 @@ typedef void (*FUNCPTR_T)( void* p ); +#undef FUNCPTR_ARRAY_LENGTH 
+#define FUNCPTR_ARRAY_LENGTH 18 #undef GENARRAY -#define GENARRAY( kername2, kername4, kername6, kername8 ) \ +#define GENARRAY( kername2, kername4, kername6, kername8, \ + kername10, kername12, kername14, kername16 ) \ \ -static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ +static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = \ { \ /* panel width = 0 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 1 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 2 */ \ { \ @@ -73,10 +70,7 @@ static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ }, \ /* panel width = 3 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 4 */ \ { \ @@ -87,10 +81,7 @@ static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ }, \ /* panel width = 5 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 6 */ \ { \ @@ -101,10 +92,7 @@ static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ }, \ /* panel width = 7 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 8 */ \ { \ @@ -115,17 +103,62 @@ static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ }, \ /* panel width = 9 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ + }, \ + /* panel width = 10 */ \ + { \ + PASTEMAC(s,kername10), \ + PASTEMAC(c,kername10), \ + PASTEMAC(d,kername10), \ + PASTEMAC(z,kername10), \ + }, \ + /* panel width = 11 */ \ + { \ + NULL, NULL, NULL, NULL, \ + }, \ + /* panel width = 12 */ \ + { \ + PASTEMAC(s,kername12), \ + PASTEMAC(c,kername12), \ + PASTEMAC(d,kername12), \ + PASTEMAC(z,kername12), \ + }, \ + /* panel width = 13 */ \ + { \ + NULL, NULL, NULL, NULL, \ + }, \ + /* panel width = 14 */ \ + { \ + PASTEMAC(s,kername14), \ + PASTEMAC(c,kername14), \ + PASTEMAC(d,kername14), \ + PASTEMAC(z,kername14), \ + }, \ + /* panel width = 15 */ \ + { \ + NULL, NULL, 
NULL, NULL, \ + }, \ + /* panel width = 16 */ \ + { \ + PASTEMAC(s,kername16), \ + PASTEMAC(c,kername16), \ + PASTEMAC(d,kername16), \ + PASTEMAC(z,kername16), \ + }, \ + /* panel width = 17 */ \ + { \ + NULL, NULL, NULL, NULL, \ } \ }; GENARRAY( PACKM_2XK_KERNEL, PACKM_4XK_KERNEL, PACKM_6XK_KERNEL, - PACKM_8XK_KERNEL ) + PACKM_8XK_KERNEL, + PACKM_10XK_KERNEL, + PACKM_12XK_KERNEL, + PACKM_14XK_KERNEL, + PACKM_16XK_KERNEL ) @@ -144,15 +177,6 @@ void PASTEMAC(ch,opname)( \ dim_t panel_dim; \ num_t dt; \ FUNCPTR_T f; \ -\ - /* The panel dimension is always equal to the leading dimension of p. */ \ - panel_dim = ldp; \ -\ - /* Acquire the datatype for the current function. */ \ - dt = PASTEMAC(ch,type); \ -\ - /* Index into the array to extract the correct function pointer. */ \ - f = ftypes[panel_dim][dt]; \ \ /* If the panel dimension is unit, then we recognize that this allows the kernel to reduce to a copyv, so we call that kernel directly. */ \ @@ -164,6 +188,19 @@ void PASTEMAC(ch,opname)( \ p, 1 ); \ return; \ } \ +\ + /* The panel dimension is always equal to the leading dimension of p. */ \ + panel_dim = ldp; \ +\ + /* Acquire the datatype for the current function. */ \ + dt = PASTEMAC(ch,type); \ +\ + /* Index into the array to extract the correct function pointer. + If the panel dimension is too big to be within the array of + explicitly handled kernels, then we treat that kernel the same + as if it were in range but unimplemented. */ \ + if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) f = ftypes[panel_dim][dt]; \ + else f = NULL; \ \ /* If there exists a kernel implementation for the panel dimension provided, and the "width" of the panel is equal to the leading @@ -180,7 +217,7 @@ void PASTEMAC(ch,opname)( \ } \ else \ { \ - /* Treat the panel as m x n and column-stored (unit row stride).*/ \ + /* Treat the panel as m x n and column-stored (unit row stride). 
*/ \ PASTEMAC3(ch,ch,ch,scal2m)( 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ diff --git a/frame/1m/packm/bl2_packm_cxk.h b/frame/1m/packm/bl2_packm_cxk.h index f2ce9a86f..53b0babf2 100644 --- a/frame/1m/packm/bl2_packm_cxk.h +++ b/frame/1m/packm/bl2_packm_cxk.h @@ -37,6 +37,10 @@ #include "bl2_packm_ref_4xk.h" #include "bl2_packm_ref_6xk.h" #include "bl2_packm_ref_8xk.h" +#include "bl2_packm_ref_10xk.h" +#include "bl2_packm_ref_12xk.h" +#include "bl2_packm_ref_14xk.h" +#include "bl2_packm_ref_16xk.h" #undef GENTPROT diff --git a/frame/1m/packm/ukernels/bl2_packm_ref_10xk.c b/frame/1m/packm/ukernels/bl2_packm_ref_10xk.c new file mode 100644 index 000000000..3b4d9d7d3 --- /dev/null +++ b/frame/1m/packm/ukernels/bl2_packm_ref_10xk.c @@ -0,0 +1,139 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis2.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p \ + ) \ +{ \ + const inc_t ldp = 10; \ +\ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bl2_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + 
PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bl2_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 
7*inca), *(pi1 + 7) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC( packm_ref_10xk, packm_ref_10xk ) + diff --git a/frame/1m/packm/ukernels/bl2_packm_ref_10xk.h b/frame/1m/packm/ukernels/bl2_packm_ref_10xk.h new file mode 100644 index 000000000..160772667 --- /dev/null +++ b/frame/1m/packm/ukernels/bl2_packm_ref_10xk.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p \ + ); + +INSERT_GENTPROT_BASIC( packm_ref_10xk ) diff --git a/frame/1m/packm/ukernels/bl2_packm_ref_12xk.c b/frame/1m/packm/ukernels/bl2_packm_ref_12xk.c new file mode 100644 index 000000000..a503dddf4 --- /dev/null +++ b/frame/1m/packm/ukernels/bl2_packm_ref_12xk.c @@ -0,0 +1,147 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis2.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p \ + ) \ +{ \ + const inc_t ldp = 12; \ +\ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bl2_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 
11*inca), *(pi1 + 11) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bl2_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ +\ + alpha1 += lda; \ + pi1 += 
ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC( packm_ref_12xk, packm_ref_12xk ) + diff --git a/frame/1m/packm/ukernels/bl2_packm_ref_12xk.h b/frame/1m/packm/ukernels/bl2_packm_ref_12xk.h new file mode 100644 index 000000000..a022d85bb --- /dev/null +++ b/frame/1m/packm/ukernels/bl2_packm_ref_12xk.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p \ + ); + +INSERT_GENTPROT_BASIC( packm_ref_12xk ) diff --git a/frame/1m/packm/ukernels/bl2_packm_ref_14xk.c b/frame/1m/packm/ukernels/bl2_packm_ref_14xk.c new file mode 100644 index 000000000..997c167ad --- /dev/null +++ b/frame/1m/packm/ukernels/bl2_packm_ref_14xk.c @@ -0,0 +1,155 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis2.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p \ + ) \ +{ \ + const inc_t ldp = 14; \ +\ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bl2_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 12*inca), *(pi1 + 12) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 13*inca), *(pi1 + 13) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC2(ch,ch,copys)( 
*(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 12*inca), *(pi1 + 12) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 13*inca), *(pi1 + 13) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bl2_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 
3*inca), *(pi1 + 3) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC( packm_ref_14xk, packm_ref_14xk ) + diff --git a/frame/1m/packm/ukernels/bl2_packm_ref_14xk.h b/frame/1m/packm/ukernels/bl2_packm_ref_14xk.h new file mode 100644 index 000000000..c3a18c8a0 --- /dev/null +++ b/frame/1m/packm/ukernels/bl2_packm_ref_14xk.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +/* Declaration template for the packm_ref_14xk kernel; ctype is accepted but not referenced here because every buffer is passed as void*. */ +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ + /* As used by the definition in bl2_packm_ref_14xk.c: conja is tested with bl2_is_conj(), n is the loop count, inca indexes the 14 elements of one iteration and lda steps between iterations. NOTE(review): the roles of beta, a and p are inferred from the sibling 16xk kernel -- confirm. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p \ + ); +/* Expands GENTPROT for packm_ref_14xk; presumably once per supported datatype -- confirm against the INSERT_GENTPROT_BASIC definition. */ +INSERT_GENTPROT_BASIC( packm_ref_14xk ) diff --git a/frame/1m/packm/ukernels/bl2_packm_ref_16xk.c b/frame/1m/packm/ukernels/bl2_packm_ref_16xk.c new file mode 100644 index 000000000..b6dcc8fda --- /dev/null +++ b/frame/1m/packm/ukernels/bl2_packm_ref_16xk.c @@ -0,0 +1,163 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis2.h" +/* Reference packm kernel for a panel dimension of 16: each iteration pairs 16 elements of a (spaced by inca) with 16 contiguous elements of p. Presumably a is the source and p the packed destination -- confirm against the copys/scal2s macro definitions. */ +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ + /* Defines the function named PASTEMAC(ch,varname); opname is not referenced anywhere in this body. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conja, /* passed to bl2_is_conj() to choose the js (presumably conjugating) variants */ \ + dim_t n, /* iteration count; the loops count it down to zero */ \ + void* beta, /* pointer to one ctype scalar; tested with eq1 and passed to scal2s/scal2js */ \ + void* a, inc_t inca, inc_t lda, /* a is accessed as ctype; inca separates the 16 elements of an iteration, lda separates iterations */ \ + void* p /* p is accessed as ctype; 16 contiguous elements per iteration */ \ + ) \ +{ \ + const inc_t ldp = 16; /* per-iteration advance of pi1, fixed to the panel dimension */ \ + /* NOTE(review): all three pointers are restrict-qualified, which assumes beta, a and p do not alias -- confirm at call sites. */ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ + /* If eq1 holds for beta (presumably beta == 1) the elements are copied unscaled; otherwise beta is applied through scal2s/scal2js. */ \ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bl2_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 12*inca), *(pi1 + 12) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 13*inca), *(pi1 + 13) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 14*inca), *(pi1 + 14) ); \ + PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 15*inca), *(pi1 + 15) ); \ + /* move both cursors to the next iteration */ \ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC2(ch,ch,copys)( 
*(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 12*inca), *(pi1 + 12) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 13*inca), *(pi1 + 13) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 14*inca), *(pi1 + 14) ); \ + PASTEMAC2(ch,ch,copys)( *(alpha1 + 15*inca), *(pi1 + 15) ); \ + /* move both cursors to the next iteration */ \ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bl2_is_conj( conja ) ) /* same conjugation split as above, now with beta applied */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 14*inca), *(pi1 + 14) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 
15*inca), *(pi1 + 15) ); \ + /* move both cursors to the next iteration */ \ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 14*inca), *(pi1 + 14) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 15*inca), *(pi1 + 15) ); \ + /* move both cursors to the next iteration */ \ + alpha1 += lda; \ + pi1 += ldp; \ + } \ + } \ + } \ +} +/* Expands GENTFUNC for packm_ref_16xk; presumably once per supported datatype -- confirm against the INSERT_GENTFUNC_BASIC definition. */ +INSERT_GENTFUNC_BASIC( packm_ref_16xk, packm_ref_16xk ) + diff --git a/frame/1m/packm/ukernels/bl2_packm_ref_16xk.h b/frame/1m/packm/ukernels/bl2_packm_ref_16xk.h new file mode 100644 index 000000000..c07a03c23 --- /dev/null +++ b/frame/1m/packm/ukernels/bl2_packm_ref_16xk.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +/* Declaration template for the packm_ref_16xk kernel; ctype is accepted but not referenced here because every buffer is passed as void*. */ +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ + /* Roles per the definition in bl2_packm_ref_16xk.c: conja is tested with bl2_is_conj(); n is the loop count; beta points to one scalar; a is indexed with inca within an iteration and advanced by lda; p holds 16 contiguous elements per iteration. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p \ + ); +/* Expands GENTPROT for packm_ref_16xk; presumably once per supported datatype -- confirm against the INSERT_GENTPROT_BASIC definition. */ +INSERT_GENTPROT_BASIC( packm_ref_16xk ) diff --git a/frame/1m/unpackm/bl2_unpackm_cxk.c b/frame/1m/unpackm/bl2_unpackm_cxk.c index 9dc42580d..094369b4d 100644 --- a/frame/1m/unpackm/bl2_unpackm_cxk.c +++ b/frame/1m/unpackm/bl2_unpackm_cxk.c @@ -44,25 +44,22 @@ typedef void (*FUNCPTR_T)( void* a, inc_t inca, inc_t lda ); +#undef FUNCPTR_ARRAY_LENGTH +#define FUNCPTR_ARRAY_LENGTH 18 #undef GENARRAY -#define GENARRAY( kername2, kername4, kername6, kername8 ) \ +#define GENARRAY( kername2, kername4, kername6, kername8, \ + kername10, kername12, kername14, kername16 ) \ \ -static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ +static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = \ { \ /* panel width = 0 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 1 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 2 */ \ { \ @@ -73,10 +70,7 @@ static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ }, \ /* panel width = 3 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 4 */ \ { \ @@ -87,10 +81,7 @@ static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ }, \ /* panel width = 5 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 6 */ \ { \ @@ -101,10 +92,7 @@ static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ }, \ /* panel width = 7 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ }, \ /* panel width = 8 */ \ { \ @@ -115,17 +103,62 @@ static FUNCPTR_T ftypes[10][BLIS_NUM_FP_TYPES] = \ }, \ /* panel width = 9 */ \ { \ - NULL, \ - NULL, \ - NULL, \ - NULL, \ + NULL, NULL, NULL, NULL, \ + }, \ + /* panel width = 10 */ \ + { \ + PASTEMAC(s,kername10), \ +
PASTEMAC(c,kername10), \ + PASTEMAC(d,kername10), \ + PASTEMAC(z,kername10), \ + }, \ + /* panel width = 11 */ \ + { \ + NULL, NULL, NULL, NULL, \ + }, \ + /* panel width = 12 */ \ + { \ + PASTEMAC(s,kername12), \ + PASTEMAC(c,kername12), \ + PASTEMAC(d,kername12), \ + PASTEMAC(z,kername12), \ + }, \ + /* panel width = 13 */ \ + { \ + NULL, NULL, NULL, NULL, \ + }, \ + /* panel width = 14 */ \ + { \ + PASTEMAC(s,kername14), \ + PASTEMAC(c,kername14), \ + PASTEMAC(d,kername14), \ + PASTEMAC(z,kername14), \ + }, \ + /* panel width = 15 */ \ + { \ + NULL, NULL, NULL, NULL, \ + }, \ + /* panel width = 16 */ \ + { \ + PASTEMAC(s,kername16), \ + PASTEMAC(c,kername16), \ + PASTEMAC(d,kername16), \ + PASTEMAC(z,kername16), \ + }, \ + /* panel width = 17 */ \ + { \ + NULL, NULL, NULL, NULL, \ } \ }; GENARRAY( UNPACKM_2XK_KERNEL, UNPACKM_4XK_KERNEL, UNPACKM_6XK_KERNEL, - UNPACKM_8XK_KERNEL ) + UNPACKM_8XK_KERNEL, + UNPACKM_10XK_KERNEL, + UNPACKM_12XK_KERNEL, + UNPACKM_14XK_KERNEL, + UNPACKM_16XK_KERNEL ) @@ -145,15 +178,6 @@ void PASTEMAC(ch,opname)( \ dim_t panel_dim; \ num_t dt; \ FUNCPTR_T f; \ -\ - /* The panel dimension is always equal to the leading dimension of p. */ \ - panel_dim = ldp; \ -\ - /* Acquire the datatype for the current function. */ \ - dt = PASTEMAC(ch,type); \ -\ - /* Index into the array to extract the correct function pointer. */ \ - f = ftypes[panel_dim][dt]; \ \ /* If the panel dimension is unit, then we recognize that this allows the kernel to reduce to a copyv, so we call that kernel directly. */ \ @@ -165,6 +189,19 @@ void PASTEMAC(ch,opname)( \ a, lda ); \ return; \ } \ +\ + /* The panel dimension is always equal to the leading dimension of p. */ \ + panel_dim = ldp; \ +\ + /* Acquire the datatype for the current function. */ \ + dt = PASTEMAC(ch,type); \ +\ + /* Index into the array to extract the correct function pointer. 
+ If the panel dimension is too big to be within the array of + explicitly handled kernels, then we treat that kernel the same + as if it were in range but unimplemented. */ \ + if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) f = ftypes[panel_dim][dt]; \ + else f = NULL; \ \ /* If there exists a kernel implementation for the panel dimension provided, and the "width" of the panel is equal to the leading @@ -181,7 +218,7 @@ void PASTEMAC(ch,opname)( \ } \ else \ { \ - /* Treat the panel as m x n and column-stored (unit row stride).*/ \ + /* Treat the panel as m x n and column-stored (unit row stride). */ \ PASTEMAC3(ch,ch,ch,scal2m)( 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ diff --git a/frame/1m/unpackm/bl2_unpackm_cxk.h b/frame/1m/unpackm/bl2_unpackm_cxk.h index cded002f2..f45e8bc7f 100644 --- a/frame/1m/unpackm/bl2_unpackm_cxk.h +++ b/frame/1m/unpackm/bl2_unpackm_cxk.h @@ -37,6 +37,10 @@ #include "bl2_unpackm_ref_4xk.h" #include "bl2_unpackm_ref_6xk.h" #include "bl2_unpackm_ref_8xk.h" +#include "bl2_unpackm_ref_10xk.h" +#include "bl2_unpackm_ref_12xk.h" +#include "bl2_unpackm_ref_14xk.h" +#include "bl2_unpackm_ref_16xk.h" #undef GENTPROT diff --git a/frame/1m/unpackm/ukernels/bl2_unpackm_ref_10xk.c b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_10xk.c new file mode 100644 index 000000000..a158d5a3d --- /dev/null +++ b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_10xk.c @@ -0,0 +1,139 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis2.h" +/* Reference unpackm kernel for a panel dimension of 10: each iteration pairs 10 contiguous elements of p with 10 elements of a (spaced by inca). Presumably p is the packed source and a the destination -- confirm against the copys/scal2s macro definitions. */ +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ + /* Defines the function named PASTEMAC(ch,varname); opname is not referenced anywhere in this body. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conjp, /* passed to bl2_is_conj() to choose the js (presumably conjugating) variants */ \ + dim_t n, /* iteration count; the loops count it down to zero */ \ + void* beta, /* pointer to one ctype scalar; tested with eq1 and passed to scal2s/scal2js */ \ + void* p, /* p is accessed as ctype; 10 contiguous elements per iteration */ \ + void* a, inc_t inca, inc_t lda /* a is accessed as ctype; inca separates the 10 elements of an iteration, lda separates iterations */ \ + ) \ +{ \ + const inc_t ldp = 10; /* per-iteration advance of pi1, fixed to the panel dimension */ \ + /* NOTE(review): all three pointers are restrict-qualified, which assumes beta, p and a do not alias -- confirm at call sites. */ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict pi1 = p; \ + ctype* restrict alpha1 = a; \ + /* If eq1 holds for beta (presumably beta == 1) the elements are copied unscaled; otherwise beta is applied through scal2s/scal2js. */ \ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bl2_is_conj( conjp ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ + /* move both cursors to the next iteration */ \ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ + /* move both cursors to the next iteration */ \ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + } \ + else \ + { \ + if ( bl2_is_conj( conjp ) ) /* same conjugation split as above, now with beta applied */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + 
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ + /* move both cursors to the next iteration */ \ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ + /* move both cursors to the next iteration */ \ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + } \ +} +/* Expands GENTFUNC for unpackm_ref_10xk; presumably once per supported datatype -- confirm against the INSERT_GENTFUNC_BASIC definition. */ +INSERT_GENTFUNC_BASIC( unpackm_ref_10xk, unpackm_ref_10xk ) + diff --git a/frame/1m/unpackm/ukernels/bl2_unpackm_ref_10xk.h b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_10xk.h new file mode 100644 index 000000000..70f0b54d9 --- /dev/null +++ 
b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_10xk.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +/* Declaration template for the unpackm_ref_10xk kernel; ctype is accepted but not referenced here because every buffer is passed as void*. */ +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ + /* Roles per the definition in bl2_unpackm_ref_10xk.c: conjp is tested with bl2_is_conj(); n is the loop count; beta points to one scalar; p holds 10 contiguous elements per iteration; a is indexed with inca within an iteration and advanced by lda. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conjp, \ + dim_t n, \ + void* beta, \ + void* p, \ + void* a, inc_t inca, inc_t lda \ + ); +/* Expands GENTPROT for unpackm_ref_10xk; presumably once per supported datatype -- confirm against the INSERT_GENTPROT_BASIC definition. */ +INSERT_GENTPROT_BASIC( unpackm_ref_10xk ) diff --git a/frame/1m/unpackm/ukernels/bl2_unpackm_ref_12xk.c b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_12xk.c new file mode 100644 index 000000000..6d6776514 --- /dev/null +++ b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_12xk.c @@ -0,0 +1,147 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis2.h" +/* Reference unpackm kernel for a panel dimension of 12: each iteration pairs 12 contiguous elements of p with 12 elements of a (spaced by inca). Presumably p is the packed source and a the destination -- confirm against the copys/scal2s macro definitions. */ +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ + /* Defines the function named PASTEMAC(ch,varname); opname is not referenced anywhere in this body. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conjp, /* passed to bl2_is_conj() to choose the js (presumably conjugating) variants */ \ + dim_t n, /* iteration count; the loops count it down to zero */ \ + void* beta, /* pointer to one ctype scalar; tested with eq1 and passed to scal2s/scal2js */ \ + void* p, /* p is accessed as ctype; 12 contiguous elements per iteration */ \ + void* a, inc_t inca, inc_t lda /* a is accessed as ctype; inca separates the 12 elements of an iteration, lda separates iterations */ \ + ) \ +{ \ + const inc_t ldp = 12; /* per-iteration advance of pi1, fixed to the panel dimension */ \ + /* NOTE(review): all three pointers are restrict-qualified, which assumes beta, p and a do not alias -- confirm at call sites. */ \ + ctype* restrict beta_cast = beta; \ + ctype* restrict pi1 = p; \ + ctype* restrict alpha1 = a; \ + /* If eq1 holds for beta (presumably beta == 1) the elements are copied unscaled; otherwise beta is applied through scal2s/scal2js. */ \ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bl2_is_conj( conjp ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ + /* move both cursors to the next iteration */ \ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ + 
PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ + /* move both cursors to the next iteration */ \ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + } \ + else \ + { \ + if ( bl2_is_conj( conjp ) ) /* same conjugation split as above, now with beta applied */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ + /* move both cursors to the next iteration */ \ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( 
*beta_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ + /* move both cursors to the next iteration */ \ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + } \ +} +/* Expands GENTFUNC for unpackm_ref_12xk; presumably once per supported datatype -- confirm against the INSERT_GENTFUNC_BASIC definition. */ +INSERT_GENTFUNC_BASIC( unpackm_ref_12xk, unpackm_ref_12xk ) + diff --git a/frame/1m/unpackm/ukernels/bl2_unpackm_ref_12xk.h b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_12xk.h new file mode 100644 index 000000000..e4967da11 --- /dev/null +++ b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_12xk.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +/* Declaration template for the unpackm_ref_12xk kernel; ctype is accepted but not referenced here because every buffer is passed as void*. */ +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ + /* Roles per the definition in bl2_unpackm_ref_12xk.c: conjp is tested with bl2_is_conj(); n is the loop count; beta points to one scalar; p holds 12 contiguous elements per iteration; a is indexed with inca within an iteration and advanced by lda. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conjp, \ + dim_t n, \ + void* beta, \ + void* p, \ + void* a, inc_t inca, inc_t lda \ + ); +/* Expands GENTPROT for unpackm_ref_12xk; presumably once per supported datatype -- confirm against the INSERT_GENTPROT_BASIC definition. */ +INSERT_GENTPROT_BASIC( unpackm_ref_12xk ) diff --git a/frame/1m/unpackm/ukernels/bl2_unpackm_ref_14xk.c b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_14xk.c new file mode 100644 index 000000000..d8b01d88e --- /dev/null +++ b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_14xk.c @@ -0,0 +1,155 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis2.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conjp, \ + dim_t n, \ + void* beta, \ + void* p, \ + void* a, inc_t inca, inc_t lda \ + ) \ +{ \ + const inc_t ldp = 14; \ +\ + ctype* restrict beta_cast = beta; \ + ctype* restrict pi1 = p; \ + ctype* restrict alpha1 = a; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bl2_is_conj( conjp ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( 
*(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ +\ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ +\ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + } \ + else \ + { \ + if ( bl2_is_conj( conjp ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 
6), *(alpha1 + 6*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ +\ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ +\ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC( unpackm_ref_14xk, unpackm_ref_14xk ) + diff --git a/frame/1m/unpackm/ukernels/bl2_unpackm_ref_14xk.h 
b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_14xk.h new file mode 100644 index 000000000..44815c7fa --- /dev/null +++ b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_14xk.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conjp, \ + dim_t n, \ + void* beta, \ + void* p, \ + void* a, inc_t inca, inc_t lda \ + ); + +INSERT_GENTPROT_BASIC( unpackm_ref_14xk ) diff --git a/frame/1m/unpackm/ukernels/bl2_unpackm_ref_16xk.c b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_16xk.c new file mode 100644 index 000000000..5efa83e30 --- /dev/null +++ b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_16xk.c @@ -0,0 +1,163 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis2.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conjp, \ + dim_t n, \ + void* beta, \ + void* p, \ + void* a, inc_t inca, inc_t lda \ + ) \ +{ \ + const inc_t ldp = 16; \ +\ + ctype* restrict beta_cast = beta; \ + ctype* restrict pi1 = p; \ + ctype* restrict alpha1 = a; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bl2_is_conj( conjp ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( *(pi1 + 14), *(alpha1 + 14*inca) ); \ + PASTEMAC2(ch,ch,copyjs)( 
*(pi1 + 15), *(alpha1 + 15*inca) ); \ +\ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 14), *(alpha1 + 14*inca) ); \ + PASTEMAC2(ch,ch,copys)( *(pi1 + 15), *(alpha1 + 15*inca) ); \ +\ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + } \ + else \ + { \ + if ( bl2_is_conj( conjp ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ + 
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ +\ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ + PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ +\ + pi1 += ldp; \ + alpha1 += lda; \ + } \ + } \ + } \ +} + 
+INSERT_GENTFUNC_BASIC( unpackm_ref_16xk, unpackm_ref_16xk ) + diff --git a/frame/1m/unpackm/ukernels/bl2_unpackm_ref_16xk.h b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_16xk.h new file mode 100644 index 000000000..d72bc4c2c --- /dev/null +++ b/frame/1m/unpackm/ukernels/bl2_unpackm_ref_16xk.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conjp, \ + dim_t n, \ + void* beta, \ + void* p, \ + void* a, inc_t inca, inc_t lda \ + ); + +INSERT_GENTPROT_BASIC( unpackm_ref_16xk ) diff --git a/version b/version index 8acdd82b7..48d014e21 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.0.1 +0.0.1-1