mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
Removed several 'old' directories and files.
Details: - Removed most of the 'old' directories scattered throughout the framework, which include alternate/half-baked/broken implementations.
This commit is contained in:
@@ -1,33 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
@@ -1,33 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
@@ -1,273 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T copyv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_unb_var1);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_unb_var1);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_unb_var1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
//
// Object-based front end for copyv. Queries the datatype, conjugation
// status, length, stride and buffer of the operands and forwards them to
// the type-specific implementation selected by the two datatypes.
//
//   x: source vector object.
//   y: destination vector object.
//
void bl2_copyv_unb_var1( obj_t* x,
                         obj_t* y )
{
	// The pair of datatypes selects the implementation to run.
	num_t     x_dt   = bl2_obj_datatype( *x );
	num_t     y_dt   = bl2_obj_datatype( *y );

	// Conjugation status and vector length are both taken from x; the
	// length of y is never queried here.
	conj_t    x_conj = bl2_obj_conj_status( *x );
	dim_t     len    = bl2_obj_vector_dim( *x );

	// Stride and starting address of the source.
	inc_t     x_inc  = bl2_obj_vector_inc( *x );
	void*     x_buf  = bl2_obj_buffer_at_off( *x );

	// Stride and starting address of the destination.
	inc_t     y_inc  = bl2_obj_vector_inc( *y );
	void*     y_buf  = bl2_obj_buffer_at_off( *y );

	// Look up the implementation for this datatype combination.
	// NOTE(review): table entries for mixed datatype combinations that
	// were not compiled are NULL and are not checked here; presumably
	// callers validate the operands beforehand -- confirm.
	FUNCPTR_T impl   = ftypes[x_dt][y_dt];

	impl( x_conj, len, x_buf, x_inc, y_buf, y_inc );
}
|
||||
|
||||
|
||||
//
// Template for the type-specific copyv implementations. Each expansion
// defines one function, named by PASTEMAC2(chx,chy,varname), that walks x
// (elements of ctype_x, stride incx) and y (elements of ctype_y, stride
// incy) in lockstep for n elements and applies the scalar copy macro to
// each pair: the "copyjs" form when conjx requests conjugation, the
// "copys" form otherwise. Nothing is done when bl2_zero_dim1( n ) holds.
// The opname argument is not used by the template body.
//
#undef  GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
\
void PASTEMAC2(chx,chy,varname)( \
                                 conj_t conjx, \
                                 dim_t  n, \
                                 void*  x, inc_t incx, \
                                 void*  y, inc_t incy \
                               ) \
{ \
	ctype_x* src = x; \
	ctype_y* dst = y; \
	dim_t    k; \
\
	/* Nothing to copy for an empty vector. */ \
	if ( bl2_zero_dim1( n ) ) return; \
\
	/* The conjugation test is hoisted out of the element loop. */ \
	if ( bl2_is_conj( conjx ) ) \
	{ \
		for ( k = 0; k < n; ++k, src += incx, dst += incy ) \
		{ \
			PASTEMAC2(chx,chy,copyjs)( *src, *dst ); \
		} \
	} \
	else \
	{ \
		for ( k = 0; k < n; ++k, src += incx, dst += incy ) \
		{ \
			PASTEMAC2(chx,chy,copys)( *src, *dst ); \
		} \
	} \
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
//INSERT_GENTFUNC2_BASIC( copyv, copyv_unb_var1 )
|
||||
GENTFUNC2( float, float, s, s, copyv, copyv_unb_var1 )
|
||||
//GENTFUNC2( double, double, d, d, copyv, copyv_unb_var1 )
|
||||
GENTFUNC2( scomplex, scomplex, c, c, copyv, copyv_unb_var1 )
|
||||
GENTFUNC2( dcomplex, dcomplex, z, z, copyv, copyv_unb_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( copyv, copyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( copyv, copyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
void bl2_ddcopyv_unb_var1(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
)
|
||||
{
|
||||
double* restrict x_cast = x;
|
||||
double* restrict y_cast = y;
|
||||
double* restrict chi1;
|
||||
double* restrict psi1;
|
||||
dim_t i;
|
||||
|
||||
//if ( bl2_zero_dim1( n ) ) return;
|
||||
|
||||
if ( n == 2 && incx == 1 && incy == 1 )
|
||||
{
|
||||
*(y_cast + 0) = *(x_cast + 0);
|
||||
*(y_cast + 1) = *(x_cast + 1);
|
||||
return;
|
||||
}
|
||||
else if ( n == 4 && incx == 1 && incy == 1 )
|
||||
{
|
||||
*(y_cast + 0) = *(x_cast + 0);
|
||||
*(y_cast + 1) = *(x_cast + 1);
|
||||
*(y_cast + 2) = *(x_cast + 2);
|
||||
*(y_cast + 3) = *(x_cast + 3);
|
||||
return;
|
||||
}
|
||||
|
||||
if ( incx == 1 &&
|
||||
incy == 1 &&
|
||||
(unsigned long)x % 16 == 0 &&
|
||||
(unsigned long)y % 16 == 0 )
|
||||
{
|
||||
dim_t n_iter = n / 4;
|
||||
dim_t n_left = n % 4;
|
||||
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
"movl %2, %%eax \n\t" // x
|
||||
"movl %4, %%ebx \n\t" // y
|
||||
" \n\t"
|
||||
"movl %3, %%ecx \n\t" // incx
|
||||
"movl %5, %%edx \n\t" // incy
|
||||
" \n\t"
|
||||
"sall $4, %%ecx \n\t" // 16*incx
|
||||
"sall $4, %%edx \n\t" // 16*incy
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movl %0, %%esi \n\t"
|
||||
"testl %%esi, %%esi \n\t"
|
||||
"je .CONSIDERKLEFT \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".LOOPKITER: \n\t"
|
||||
" \n\t"
|
||||
"movapd (%%eax), %%xmm0 \n\t"
|
||||
"movapd %%xmm0, (%%ebx) \n\t"
|
||||
" \n\t"
|
||||
"movapd (%%eax,%%ecx), %%xmm1 \n\t"
|
||||
"movapd %%xmm1, (%%ebx,%%edx) \n\t"
|
||||
" \n\t"
|
||||
"leal (%%eax,%%ecx,2), %%eax \n\t"
|
||||
"leal (%%ebx,%%edx,2), %%ebx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decl %%esi \n\t"
|
||||
"jne .LOOPKITER \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".CONSIDERKLEFT: \n\t"
|
||||
" \n\t"
|
||||
"movl %1, %%esi \n\t"
|
||||
"testl %%esi, %%esi \n\t"
|
||||
"je .DONE \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"sarl $1, %%ecx \n\t" // 8*incx
|
||||
"sarl $1, %%edx \n\t" // 8*incy
|
||||
" \n\t"
|
||||
".LOOPKLEFT: \n\t"
|
||||
" \n\t"
|
||||
"movlpd (%%eax), %%xmm0 \n\t"
|
||||
"movlpd %%xmm0, (%%ebx) \n\t"
|
||||
" \n\t"
|
||||
"addl %%ecx, %%eax \n\t"
|
||||
"addl %%edx, %%ebx \n\t"
|
||||
" \n\t"
|
||||
"decl %%esi \n\t"
|
||||
"jne .LOOPKLEFT \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".DONE: \n\t"
|
||||
" \n\t"
|
||||
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"r" (n_iter),
|
||||
"r" (n_left),
|
||||
"m" (x),
|
||||
"m" (incx),
|
||||
"m" (y),
|
||||
"m" (incy)
|
||||
: // register clobber list
|
||||
"esi", "eax", "ebx", "ecx", "edx",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
"xmm4", "xmm5", "xmm6", "xmm7"
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
chi1 = x;
|
||||
psi1 = y;
|
||||
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
bl2_ddcopys( *chi1, *psi1 );
|
||||
|
||||
chi1 += incx;
|
||||
psi1 += incy;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
@@ -1,33 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
@@ -1,78 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
ctype* p_edge = p_begin + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( *n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = *n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p_begin + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
if ( rs_p == 1 ) { \
|
||||
printf( "packm_blk_var2: ps_p = %u\n", ps_p ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: p copied", m_panel_max, n_panel_max, \
|
||||
p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
} \
|
||||
\
|
||||
if ( rs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: c copied", m_panel_max, n_panel_max, \
|
||||
p_begin, 1, panel_dim, "%4.1f", "" ); \
|
||||
} \
|
||||
\
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm, packm_blk_var2 )
|
||||
|
||||
@@ -1,134 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
\
|
||||
if ( bl2_is_lower( uploc ) ) panel_off_i = 0; \
|
||||
else panel_off_i = bl2_max( 0, diagoffc_i ); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
panel_len_i = panel_len; \
|
||||
panel_off_i = 0; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
c_use = c_begin + panel_off_i*ldc; \
|
||||
p_use = p_begin + panel_off_i*panel_dim; \
|
||||
\
|
||||
PASTEMAC(ch,packm_cxk)( conjc, \
|
||||
panel_dim_i, \
|
||||
panel_len_i, \
|
||||
beta_cast, \
|
||||
c_use, incc, ldc, \
|
||||
p_use, panel_dim ); \
|
||||
\
|
||||
/*
|
||||
if ( bl2_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,setd)( diagoffc_i, \
|
||||
*m_panel, \
|
||||
*n_panel, \
|
||||
beta_cast, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( bl2_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
|
||||
bl2_is_upper_or_lower( uploc ) && \
|
||||
densify == TRUE ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_densify)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
transc, \
|
||||
*m_panel, \
|
||||
*n_panel, \
|
||||
beta_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,packm_cxk)( conjc, \
|
||||
panel_dim_i, \
|
||||
panel_len, \
|
||||
beta_cast, \
|
||||
c_begin, incc, ldc, \
|
||||
p_begin, panel_dim ); \
|
||||
*/ \
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This simplifies the
|
||||
register level micro kernel in that it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( *m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t m_edge = m_panel_max - *m_panel; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p_begin + (*m_panel )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( *n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - *n_panel; \
|
||||
ctype* p_edge = p_begin + (*n_panel )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm, packm_blk_var3 )
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
@@ -1,108 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
FALSE, \
|
||||
m_a10, \
|
||||
n_a10, \
|
||||
m_max_a10, \
|
||||
n_max_a10, \
|
||||
beta, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p, ps_pt ); \
|
||||
\
|
||||
p_cast += m_max_a10 * n_max_a10; \
|
||||
} \
|
||||
\
|
||||
/* Pack triangle subpartition A11. */ \
|
||||
{ \
|
||||
j = n_a10; \
|
||||
c_begin = c_cast + (0 )*rs_c + (j )*cs_c; \
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
/* This instance of ps_pt is not used by var3. */ \
|
||||
ps_pt = cs_p * n_max_a11; \
|
||||
\
|
||||
PASTEMAC(ch,packm_blk_var3)( BLIS_TRIANGULAR, \
|
||||
0, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
densify, \
|
||||
invdiag, \
|
||||
m_a11, \
|
||||
n_a11, \
|
||||
m_max_a11, \
|
||||
n_max_a11, \
|
||||
beta, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p, ps_p ); \
|
||||
\
|
||||
p_cast += m_max_a11 * n_max_a11; \
|
||||
} \
|
||||
\
|
||||
/* If they exist, pack subpartitions A20 and A21. */ \
|
||||
if ( m_a2021 ) \
|
||||
{ \
|
||||
i = m_a10; \
|
||||
c_begin = c_cast + (i )*rs_c + (0 )*cs_c; \
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
ps_pt = cs_p * n_max_a2021; \
|
||||
\
|
||||
PASTEMAC(ch,packm_blk_var2)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
FALSE, \
|
||||
m_a2021, \
|
||||
n_a2021, \
|
||||
m_max_a2021, \
|
||||
n_max_a2021, \
|
||||
beta, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p, ps_pt ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
bl2_abort(); \
|
||||
} \
|
||||
\
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm, packm_blk_var4 )
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
@@ -1,91 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
/* Pack subpartitions A10 and A11. */ \
|
||||
{ \
|
||||
c_begin = c_cast + (0 )*rs_c + (0 )*cs_c; \
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
PASTEMAC(ch,packm_blk_var3)( BLIS_TRIANGULAR, \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
densify, \
|
||||
invdiag, \
|
||||
revifup, \
|
||||
reviflo, \
|
||||
m_a1011, \
|
||||
n_a1011, \
|
||||
m_max_a1011, \
|
||||
n_max_a1011, \
|
||||
beta, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p, ps_p ); \
|
||||
\
|
||||
p_cast += step_a1011; \
|
||||
} \
|
||||
\
|
||||
/* If they exist, pack subpartitions A20 and A21. */ \
|
||||
if ( m_a2021 ) \
|
||||
{ \
|
||||
i = m_a1011; \
|
||||
c_begin = c_cast + (i )*rs_c + (0 )*cs_c; \
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
ps_pt = cs_p * n_max_a2021; \
|
||||
\
|
||||
PASTEMAC(ch,packm_blk_var2)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
FALSE, \
|
||||
m_a2021, \
|
||||
n_a2021, \
|
||||
m_max_a2021, \
|
||||
n_max_a2021, \
|
||||
beta, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p, ps_pt ); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
} \
|
||||
\
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm, packm_blk_var4 )
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
@@ -1,238 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
bl2_sgemm_asm_var2,
|
||||
bl2_cgemm_asm_var2,
|
||||
bl2_dgemm_asm_var2,
|
||||
bl2_zgemm_asm_var2
|
||||
};
|
||||
|
||||
void bl2_gemm_asm_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
num_t dt_a = bl2_obj_datatype( *a );
|
||||
num_t dt_b = bl2_obj_datatype( *b );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate the m and leading dimensions
|
||||
// by a factor of two.
|
||||
/*
|
||||
if ( bl2_is_complex( dt_a ) && bl2_is_real( dt_b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
void PASTEMAC(s,gemm_asm_var2)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
void PASTEMAC(c,gemm_asm_var2)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
void PASTEMAC(z,gemm_asm_var2)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
#include "pmmintrin.h"
|
||||
|
||||
/* A pair of doubles that can be accessed either as one 128-bit SSE value (v)
   or lane by lane (d[0], d[1]). */
typedef union
{
	__m128d v;
	double  d[2];
} v2df_t;

/* Compile-time switch for the double-precision kernel below: a nonzero value
   selects its scalar code path, zero selects the SSE one. */
#define NOSSE 0
|
||||
|
||||
void PASTEMAC(d,gemm_asm_var2)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
double* restrict a_cast = a;
|
||||
double* restrict b_cast = b;
|
||||
double* restrict c_cast = c;
|
||||
double* restrict a1;
|
||||
double* restrict b1;
|
||||
double* restrict c1;
|
||||
double* restrict alpha11;
|
||||
double* restrict beta11;
|
||||
double* restrict gamma11;
|
||||
dim_t i, j, h;
|
||||
v2df_t b1v;;
|
||||
v2df_t a1v, a2v;;
|
||||
v2df_t c1v, c2v;
|
||||
|
||||
dim_t m_iter = m / 2;
|
||||
dim_t m_left = m % 2;
|
||||
|
||||
inc_t step_a = 2*rs_a;
|
||||
inc_t step_c = 2*rs_c;
|
||||
|
||||
for ( j = 0; j < n; ++j )
|
||||
{
|
||||
c1 = c_cast + (j )* cs_c;
|
||||
b1 = b_cast + (j )* cs_b;
|
||||
|
||||
for ( h = 0; h < k; ++h )
|
||||
{
|
||||
a1 = a_cast + (h )*cs_a;
|
||||
beta11 = b1 + (h )*rs_b;
|
||||
|
||||
#if NOSSE
|
||||
#else
|
||||
b1v.v = _mm_loaddup_pd( beta11 );
|
||||
#endif
|
||||
|
||||
alpha11 = a1;
|
||||
gamma11 = c1;
|
||||
|
||||
for ( i = 0; i < m_iter; ++i )
|
||||
{
|
||||
#if NOSSE
|
||||
*(gamma11 ) += *beta11 * *(alpha11 );
|
||||
*(gamma11+1) += *beta11 * *(alpha11+1);
|
||||
*(gamma11+2) += *beta11 * *(alpha11+2);
|
||||
*(gamma11+3) += *beta11 * *(alpha11+3);
|
||||
#else
|
||||
a1v.v = _mm_load_pd( alpha11 );
|
||||
//a2v.v = _mm_load_pd( alpha11+2 );
|
||||
c1v.v = _mm_load_pd( gamma11 );
|
||||
//c2v.v = _mm_load_pd( gamma11+2 );
|
||||
|
||||
c1v.v += b1v.v * a1v.v;
|
||||
//c2v.v += b1v.v * a2v.v;
|
||||
|
||||
_mm_store_pd( gamma11, c1v.v );
|
||||
//_mm_store_pd( gamma11+2, c2v.v );
|
||||
#endif
|
||||
|
||||
alpha11 += step_a;
|
||||
gamma11 += step_c;
|
||||
}
|
||||
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
*(gamma11 ) += *beta11 * *(alpha11 );
|
||||
|
||||
alpha11 += rs_a;
|
||||
gamma11 += rs_c;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,59 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
void bl2_gemm_asm_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( chabc, varname ) \
|
||||
\
|
||||
void PASTEMAC(chabc,varname)( \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
GENPROT( s, gemm_asm_var2 )
|
||||
GENPROT( d, gemm_asm_var2 )
|
||||
GENPROT( c, gemm_asm_var2 )
|
||||
GENPROT( z, gemm_asm_var2 )
|
||||
|
||||
@@ -1,318 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
bl2_sgemm_asm_var3,
|
||||
bl2_cgemm_asm_var3,
|
||||
bl2_dgemm_asm_var3,
|
||||
bl2_zgemm_asm_var3
|
||||
};
|
||||
|
||||
void bl2_gemm_asm_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
//num_t dt_a = bl2_obj_datatype( *a );
|
||||
//num_t dt_b = bl2_obj_datatype( *b );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate the m and leading dimensions
|
||||
// by a factor of two.
|
||||
/*
|
||||
if ( bl2_is_complex( dt_a ) && bl2_is_real( dt_b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
void PASTEMAC(s,gemm_asm_var3)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
void PASTEMAC(c,gemm_asm_var3)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
void PASTEMAC(z,gemm_asm_var3)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
#include "pmmintrin.h"
|
||||
|
||||
/* A pair of doubles that can be accessed either as one 128-bit SSE value (v)
   or lane by lane (d[0], d[1]). */
typedef union
{
	__m128d v;
	double  d[2];
} v2df_t;

/* Defined as 0. NOTE(review): no code visible in this file tests NOSSE;
   it looks like a leftover from a sibling variant -- confirm before
   relying on it. */
#define NOSSE 0
|
||||
|
||||
void PASTEMAC(d,gemm_asm_var3)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
double* restrict a_cast = a;
|
||||
double* restrict b_cast = b;
|
||||
double* restrict c_cast = c;
|
||||
double* restrict a1;
|
||||
double* restrict b1;
|
||||
double* restrict c1;
|
||||
double* restrict a11;
|
||||
double* restrict b11;
|
||||
double* restrict c11;
|
||||
|
||||
double* restrict alpha00;
|
||||
double* restrict alpha20;
|
||||
double* restrict beta00;
|
||||
double* restrict beta01;
|
||||
|
||||
double* restrict gamma00;
|
||||
double* restrict gamma20;
|
||||
double* restrict gamma01;
|
||||
double* restrict gamma21;
|
||||
|
||||
v2df_t c00v, c01v;
|
||||
v2df_t c10v, c11v;
|
||||
v2df_t a0v, a1v;
|
||||
v2df_t b0v, b1v;
|
||||
|
||||
dim_t i, j, h;
|
||||
|
||||
dim_t n_iter = n / 2;
|
||||
dim_t n_left = n % 2;
|
||||
|
||||
dim_t m_iter = m / 4;
|
||||
dim_t m_left = m % 4;
|
||||
|
||||
dim_t k_iter = k / 2;
|
||||
dim_t k_left = k % 2;
|
||||
|
||||
b1 = b_cast;
|
||||
c1 = c_cast;
|
||||
|
||||
for ( j = 0; j < n_iter; ++j )
|
||||
{
|
||||
a1 = a_cast;
|
||||
c11 = c1;
|
||||
|
||||
for ( i = 0; i < m_iter; ++i )
|
||||
{
|
||||
gamma00 = c11 + 0*rs_c + 0*cs_c;
|
||||
gamma20 = c11 + 2*rs_c + 0*cs_c;
|
||||
|
||||
gamma01 = c11 + 0*rs_c + 1*cs_c;
|
||||
gamma21 = c11 + 2*rs_c + 1*cs_c;
|
||||
|
||||
a11 = a1;
|
||||
b11 = b1;
|
||||
|
||||
c00v.v = _mm_load_pd( gamma00 );
|
||||
c10v.v = _mm_load_pd( gamma20 );
|
||||
c01v.v = _mm_load_pd( gamma01 );
|
||||
c11v.v = _mm_load_pd( gamma21 );
|
||||
|
||||
for ( h = 0; h < k_iter; ++h )
|
||||
{
|
||||
alpha00 = a11;
|
||||
alpha20 = a11 + 2;
|
||||
|
||||
beta00 = b11;
|
||||
beta01 = b11 + cs_b;
|
||||
//beta01 = b11 + 100;
|
||||
|
||||
|
||||
a0v.v = _mm_load_pd( alpha00 );
|
||||
a1v.v = _mm_load_pd( alpha20 );
|
||||
|
||||
b0v.v = _mm_loaddup_pd( beta00 );
|
||||
c00v.v += a0v.v * b0v.v;
|
||||
c10v.v += a1v.v * b0v.v;
|
||||
|
||||
b1v.v = _mm_loaddup_pd( beta01 );
|
||||
c01v.v += a0v.v * b1v.v;
|
||||
c11v.v += a1v.v * b1v.v;
|
||||
|
||||
|
||||
a0v.v = _mm_load_pd( alpha00 + cs_a );
|
||||
a1v.v = _mm_load_pd( alpha20 + cs_a );
|
||||
//a0v.v = _mm_load_pd( alpha00 + 100 );
|
||||
//a1v.v = _mm_load_pd( alpha20 + 100 );
|
||||
|
||||
b0v.v = _mm_loaddup_pd( beta00 + 1 );
|
||||
c00v.v += a0v.v * b0v.v;
|
||||
c10v.v += a1v.v * b0v.v;
|
||||
|
||||
b1v.v = _mm_loaddup_pd( beta01 + 1 );
|
||||
c01v.v += a0v.v * b1v.v;
|
||||
c11v.v += a1v.v * b1v.v;
|
||||
|
||||
|
||||
a11 += 2*cs_a;
|
||||
//a11 += 200;
|
||||
b11 += 2;
|
||||
}
|
||||
|
||||
for ( h = 0; h < k_left; ++h )
|
||||
{
|
||||
alpha00 = a11;
|
||||
alpha20 = a11 + 2;
|
||||
|
||||
beta00 = b11;
|
||||
beta01 = b11 + cs_b;
|
||||
|
||||
a0v.v = _mm_load_pd( alpha00 );
|
||||
a1v.v = _mm_load_pd( alpha20 );
|
||||
|
||||
b0v.v = _mm_loaddup_pd( beta00 );
|
||||
|
||||
c00v.v += a0v.v * b0v.v;
|
||||
c10v.v += a1v.v * b0v.v;
|
||||
|
||||
b1v.v = _mm_loaddup_pd( beta01 );
|
||||
|
||||
c01v.v += a0v.v * b1v.v;
|
||||
c11v.v += a1v.v * b1v.v;
|
||||
|
||||
a11 += cs_a;
|
||||
b11 += 1;
|
||||
}
|
||||
|
||||
_mm_store_pd( gamma00, c00v.v );
|
||||
_mm_store_pd( gamma20, c10v.v );
|
||||
_mm_store_pd( gamma01, c01v.v );
|
||||
_mm_store_pd( gamma21, c11v.v );
|
||||
|
||||
//a1 += 4*rs_a;
|
||||
//c11 += 4*rs_c;
|
||||
a1 += 4;
|
||||
c11 += 4;
|
||||
}
|
||||
/*
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
}
|
||||
*/
|
||||
b1 += 2*cs_b;
|
||||
c1 += 2*cs_c;
|
||||
}
|
||||
/*
|
||||
for ( j = 0; j < n_left ++j )
|
||||
{
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
}
|
||||
@@ -1,59 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
void bl2_gemm_asm_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( chabc, varname ) \
|
||||
\
|
||||
void PASTEMAC(chabc,varname)( \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
GENPROT( s, gemm_asm_var3 )
|
||||
GENPROT( d, gemm_asm_var3 )
|
||||
GENPROT( c, gemm_asm_var3 )
|
||||
GENPROT( z, gemm_asm_var3 )
|
||||
|
||||
@@ -1,336 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
bl2_sgemm_asm_var4,
|
||||
bl2_cgemm_asm_var4,
|
||||
bl2_dgemm_asm_var4,
|
||||
bl2_zgemm_asm_var4
|
||||
};
|
||||
|
||||
void bl2_gemm_asm_var4( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
//num_t dt_a = bl2_obj_datatype( *a );
|
||||
//num_t dt_b = bl2_obj_datatype( *b );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
inc_t ps_a = bl2_obj_panel_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate the m and leading dimensions
|
||||
// by a factor of two.
|
||||
/*
|
||||
if ( bl2_is_complex( dt_a ) && bl2_is_real( dt_b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a, ps_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
void PASTEMAC(s,gemm_asm_var4)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
void PASTEMAC(c,gemm_asm_var4)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
void PASTEMAC(z,gemm_asm_var4)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
#include "pmmintrin.h"
|
||||
|
||||
/* A pair of doubles that can be accessed either as one 128-bit SSE value (v)
   or lane by lane (d[0], d[1]). */
typedef union
{
	__m128d v;
	double  d[2];
} v2df_t;

/* Defined as 0. NOTE(review): no code visible in this file tests NOSSE;
   it looks like a leftover from a sibling variant -- confirm before
   relying on it. */
#define NOSSE 0
|
||||
|
||||
void PASTEMAC(d,gemm_asm_var4)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
double* restrict a_cast = a;
|
||||
double* restrict b_cast = b;
|
||||
double* restrict c_cast = c;
|
||||
double* restrict a1;
|
||||
double* restrict b1;
|
||||
double* restrict c1;
|
||||
double* restrict a11;
|
||||
double* restrict b11;
|
||||
double* restrict c11;
|
||||
|
||||
double* restrict alpha00;
|
||||
double* restrict alpha20;
|
||||
double* restrict beta00;
|
||||
double* restrict beta01;
|
||||
|
||||
double* restrict gamma00;
|
||||
double* restrict gamma20;
|
||||
double* restrict gamma01;
|
||||
double* restrict gamma21;
|
||||
|
||||
v2df_t c00v, c01v;
|
||||
v2df_t c10v, c11v;
|
||||
v2df_t a0v, a1v;
|
||||
v2df_t b0v, b1v;
|
||||
|
||||
dim_t i, j, h;
|
||||
|
||||
dim_t n_iter = n / 2;
|
||||
dim_t n_left = n % 2;
|
||||
|
||||
dim_t m_iter = m / 4;
|
||||
dim_t m_left = m % 4;
|
||||
|
||||
//dim_t k_iter = k / 2;
|
||||
//dim_t k_left = k % 2;
|
||||
dim_t k_iter = k / 2;
|
||||
dim_t k_left = k % 2;
|
||||
|
||||
|
||||
b1 = b_cast;
|
||||
c1 = c_cast;
|
||||
|
||||
for ( j = 0; j < n_iter; ++j )
|
||||
{
|
||||
a1 = a_cast;
|
||||
c11 = c1;
|
||||
|
||||
gamma00 = c11;
|
||||
gamma20 = c11 + 2;
|
||||
|
||||
gamma01 = c11 + cs_c;
|
||||
gamma21 = c11 + 2 + cs_c;
|
||||
|
||||
for ( i = 0; i < m_iter; ++i )
|
||||
{
|
||||
/*
|
||||
gamma00 = c11 + 0*rs_c + 0*cs_c;
|
||||
gamma20 = c11 + 2*rs_c + 0*cs_c;
|
||||
|
||||
gamma01 = c11 + 0*rs_c + 1*cs_c;
|
||||
gamma21 = c11 + 2*rs_c + 1*cs_c;
|
||||
*/
|
||||
|
||||
a11 = a1;
|
||||
b11 = b1;
|
||||
|
||||
c00v.v = _mm_load_pd( gamma00 );
|
||||
c10v.v = _mm_load_pd( gamma20 );
|
||||
c01v.v = _mm_load_pd( gamma01 );
|
||||
c11v.v = _mm_load_pd( gamma21 );
|
||||
|
||||
alpha00 = a11;
|
||||
alpha20 = a11 + 2;
|
||||
|
||||
beta00 = b11;
|
||||
beta01 = b11 + cs_b;
|
||||
|
||||
for ( h = 0; h < k_iter; ++h )
|
||||
{
|
||||
|
||||
a0v.v = _mm_load_pd( alpha00 );
|
||||
a1v.v = _mm_load_pd( alpha20 );
|
||||
alpha00 += 4;
|
||||
alpha20 += 4;
|
||||
|
||||
b0v.v = _mm_loaddup_pd( beta00 );
|
||||
beta00 += 1;
|
||||
c00v.v += a0v.v * b0v.v;
|
||||
c10v.v += a1v.v * b0v.v;
|
||||
|
||||
b1v.v = _mm_loaddup_pd( beta01 );
|
||||
beta01 += 1;
|
||||
c01v.v += a0v.v * b1v.v;
|
||||
c11v.v += a1v.v * b1v.v;
|
||||
|
||||
|
||||
a0v.v = _mm_load_pd( alpha00 );
|
||||
a1v.v = _mm_load_pd( alpha20 );
|
||||
alpha00 += 4;
|
||||
alpha20 += 4;
|
||||
|
||||
b0v.v = _mm_loaddup_pd( beta00 );
|
||||
beta00 += 1;
|
||||
c00v.v += a0v.v * b0v.v;
|
||||
c10v.v += a1v.v * b0v.v;
|
||||
|
||||
b1v.v = _mm_loaddup_pd( beta01 );
|
||||
beta01 += 1;
|
||||
c01v.v += a0v.v * b1v.v;
|
||||
c11v.v += a1v.v * b1v.v;
|
||||
|
||||
|
||||
|
||||
//alpha00 += 8;
|
||||
//alpha20 += 8;
|
||||
|
||||
|
||||
}
|
||||
|
||||
for ( h = 0; h < k_left; ++h )
|
||||
{
|
||||
a0v.v = _mm_load_pd( alpha00 );
|
||||
a1v.v = _mm_load_pd( alpha20 );
|
||||
|
||||
b0v.v = _mm_loaddup_pd( beta00++ );
|
||||
c00v.v += a0v.v * b0v.v;
|
||||
c10v.v += a1v.v * b0v.v;
|
||||
|
||||
b1v.v = _mm_loaddup_pd( beta01++ );
|
||||
c01v.v += a0v.v * b1v.v;
|
||||
c11v.v += a1v.v * b1v.v;
|
||||
|
||||
alpha00 += 4;
|
||||
alpha20 += 4;
|
||||
}
|
||||
|
||||
_mm_store_pd( gamma00, c00v.v );
|
||||
_mm_store_pd( gamma20, c10v.v );
|
||||
_mm_store_pd( gamma01, c01v.v );
|
||||
_mm_store_pd( gamma21, c11v.v );
|
||||
|
||||
//a1 += 4*rs_a;
|
||||
//c11 += 4*rs_c;
|
||||
//a1 += 4;
|
||||
/*
|
||||
a1 += ps_a;
|
||||
c11 += 4;
|
||||
*/
|
||||
gamma00 += 4;
|
||||
gamma20 += 4;
|
||||
gamma01 += 4;
|
||||
gamma21 += 4;
|
||||
}
|
||||
/*
|
||||
for ( i = 0; i < m_left; ++i )
|
||||
{
|
||||
}
|
||||
*/
|
||||
b1 += 2*cs_b;
|
||||
c1 += 2*cs_c;
|
||||
}
|
||||
/*
|
||||
for ( j = 0; j < n_left ++j )
|
||||
{
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
}
|
||||
@@ -1,59 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
void bl2_gemm_asm_var4( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( chabc, varname ) \
|
||||
\
|
||||
void PASTEMAC(chabc,varname)( \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
GENPROT( s, gemm_asm_var4 )
|
||||
GENPROT( d, gemm_asm_var4 )
|
||||
GENPROT( c, gemm_asm_var4 )
|
||||
GENPROT( z, gemm_asm_var4 )
|
||||
|
||||
@@ -1,169 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_unb_var2);
|
||||
|
||||
|
||||
void bl2_gemm_unb_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
/*
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate the m and leading dimensions
|
||||
// by a factor of two.
|
||||
if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
ctype* a1; \
|
||||
ctype* b1; \
|
||||
ctype* c1; \
|
||||
ctype* alpha11; \
|
||||
ctype* beta11; \
|
||||
ctype* gamma11; \
|
||||
ctype rho; \
|
||||
dim_t i, j, h; \
|
||||
\
|
||||
if ( bl2_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
c1 = c; \
|
||||
b1 = b; \
|
||||
\
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
gamma11 = c1; \
|
||||
a1 = a; \
|
||||
\
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
/* gamma11 = c1 + (i )*rs_c + (j )*cs_c; */ \
|
||||
\
|
||||
alpha11 = a1; \
|
||||
beta11 = b1; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( rho ); \
|
||||
\
|
||||
for ( h = 0; h < k; ++h ) \
|
||||
{ \
|
||||
/* alpha11 = a1 + (i )*rs_a + (h )*cs_a; */ \
|
||||
/* beta11 = b1 + (h )*rs_b + (j )*cs_b; */ \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *alpha11, *beta11, rho ); \
|
||||
\
|
||||
alpha11 += cs_a; \
|
||||
beta11 += rs_b; \
|
||||
} \
|
||||
\
|
||||
PASTEMAC(ch,adds)( rho, *gamma11 ); \
|
||||
\
|
||||
gamma11 += rs_c; \
|
||||
a1 += rs_a; \
|
||||
} \
|
||||
\
|
||||
c1 += cs_c; \
|
||||
b1 += cs_b; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm, gemm_unb_var2 )
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
void bl2_gemm_unb_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_unb_var2 )
|
||||
|
||||
@@ -1,163 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_her2k_l_blk_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* bh,
|
||||
obj_t* alpha_conj,
|
||||
obj_t* b,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
her2k_t* cntl )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t bh_pack, bhL_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t ah_pack, ahL_pack;
|
||||
obj_t c1;
|
||||
obj_t c1L, c1L_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offL, nL;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bl2_obj_init_pack( &a1_pack );
|
||||
bl2_obj_init_pack( &bh_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
bl2_obj_init_pack( &ah_pack );
|
||||
bl2_obj_init_pack( &c1L_pack );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bl2_obj_length_after_trans( *c );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bl2_scalm_int( beta,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize objects for packing B' and A'.
|
||||
bl2_packm_init( bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack B' and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack A' and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bl2_determine_blocksize_b( i, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1, B1 and C1.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, a, &a1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, b, &b1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding regions
|
||||
// of Bh_pack and Ah_pack. We compute the width of the subpartition
|
||||
// taking the location of the diagonal into account.
|
||||
offL = 0;
|
||||
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
|
||||
bl2_obj_diag_offset_after_trans( c1 ) + b_alg );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &c1, &c1L );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &bh_pack, &bhL_pack );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &ah_pack, &ahL_pack );
|
||||
|
||||
// Initialize objects for packing A1, B1, and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack B1 and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
&b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform her2k subproblem.
|
||||
bl2_her2k_int( alpha,
|
||||
&a1_pack,
|
||||
&bhL_pack,
|
||||
alpha_conj,
|
||||
&b1_pack,
|
||||
&ahL_pack,
|
||||
beta,
|
||||
&c1L_pack,
|
||||
cntl_sub_her2k( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1L_pack, &c1L,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &bh_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
bl2_obj_release_pack( &ah_pack );
|
||||
bl2_obj_release_pack( &c1L_pack );
|
||||
}
|
||||
|
||||
@@ -1,198 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
extern gemm_t* gemm_cntl_bp_ke;
|
||||
|
||||
void bl2_her2k_l_blk_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* bh,
|
||||
obj_t* alpha_conj,
|
||||
obj_t* b,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
her2k_t* cntl )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t bh_pack, bhL_pack, bhM_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t ah_pack, ahL_pack, ahM_pack;
|
||||
obj_t c1;
|
||||
obj_t c1L, c1M, c1L_pack, c1M_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offL, nL;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bl2_obj_init_pack( &a1_pack );
|
||||
bl2_obj_init_pack( &bh_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
bl2_obj_init_pack( &ah_pack );
|
||||
bl2_obj_init_pack( &c1L_pack );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bl2_obj_length_after_trans( *c );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bl2_scalm_int( beta,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize objects for packing B' and A'.
|
||||
bl2_packm_init( bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack B' and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack A' and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bl2_determine_blocksize_b( i, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1, B1 and C1.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, a, &a1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, b, &b1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding regions
|
||||
// of Bh_pack and Ah_pack. We compute the width of the subpartition
|
||||
// taking the location of the diagonal into account.
|
||||
offL = 0;
|
||||
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
|
||||
// bl2_obj_diag_offset_after_trans( c1 ) + b_alg );
|
||||
bl2_obj_diag_offset_after_trans( c1 ) );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &c1, &c1L );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &bh_pack, &bhL_pack );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &ah_pack, &ahL_pack );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
nL, b_alg, &c1, &c1M );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
nL, b_alg, &bh_pack, &bhM_pack );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
nL, b_alg, &ah_pack, &ahM_pack );
|
||||
|
||||
// Initialize objects for packing A1, B1, and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
bl2_packm_init( &c1M, &c1M_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack B1 and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
&b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
if ( bl2_obj_width( c1L ) > 0 )
|
||||
bl2_packm_int( beta,
|
||||
&c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1M, &c1M_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
if ( bl2_obj_width( c1L ) > 0 )
|
||||
{
|
||||
bl2_gemm_int( alpha,
|
||||
&a1_pack,
|
||||
&bhL_pack,
|
||||
beta,
|
||||
&c1L_pack,
|
||||
gemm_cntl_bp_ke );
|
||||
|
||||
bl2_gemm_int( alpha_conj,
|
||||
&b1_pack,
|
||||
&ahL_pack,
|
||||
beta,
|
||||
&c1L_pack,
|
||||
gemm_cntl_bp_ke );
|
||||
}
|
||||
|
||||
// Perform her2k subproblem.
|
||||
bl2_her2k_int( alpha,
|
||||
&a1_pack,
|
||||
&bhM_pack,
|
||||
alpha_conj,
|
||||
&b1_pack,
|
||||
&ahM_pack,
|
||||
beta,
|
||||
&c1M_pack,
|
||||
cntl_sub_her2k( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
if ( bl2_obj_width( c1L ) > 0 )
|
||||
bl2_unpackm_int( &c1L_pack, &c1L,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &bh_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
bl2_obj_release_pack( &ah_pack );
|
||||
bl2_obj_release_pack( &c1L_pack );
|
||||
}
|
||||
|
||||
@@ -1,246 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_her2k_l_blk_var4( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* bh,
|
||||
obj_t* alpha_conj,
|
||||
obj_t* b,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
her2k_t* cntl )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t bh_pack, bhL_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t ah_pack, ahL_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t c1L, c1L_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t bm_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offL, nL;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bl2_obj_init_pack( &a1_pack );
|
||||
bl2_obj_init_pack( &bh_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
bl2_obj_init_pack( &ah_pack );
|
||||
bl2_obj_init_pack( &c1_pack );
|
||||
bl2_obj_init_pack( &c1L_pack );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bl2_obj_length_after_trans( *c );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bl2_scalm_int( beta,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing B'.
|
||||
bl2_packm_init( bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Initialize object for packing A'.
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Fuse the first iteration with incremental packing and computation.
|
||||
{
|
||||
obj_t bh_inc, bh_pack_inc;
|
||||
obj_t ah_inc, ah_pack_inc;
|
||||
obj_t c1_pack_inc;
|
||||
|
||||
dim_t j;
|
||||
dim_t bn_inc;
|
||||
dim_t n_trans;
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bl2_obj_width( bh_pack );
|
||||
|
||||
// Determine the current algorithmic blocksize.
|
||||
bm_alg = bl2_determine_blocksize_b( 0, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1, B1, and C1.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
0, bm_alg, a, &a1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
0, bm_alg, b, &b1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
0, bm_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1, B1, and C1.
|
||||
bl2_packm_init( &a1, &a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &b1, &b1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1, &c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta, &c1, &c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha, &a1, &a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack B1 and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj, &b1, &b1_pack, cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( j = 0; j < n_trans; j += bn_inc )
|
||||
{
|
||||
// Determine the current incremental packing blocksize.
|
||||
bn_inc = bl2_determine_blocksize_f( j, n_trans, a,
|
||||
cntl_blocksize_aux( cntl ) );
|
||||
|
||||
// Acquire incremental partitions.
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
bh, &bh_inc );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
&bh_pack, &bh_pack_inc );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
&c1_pack, &c1_pack_inc );
|
||||
|
||||
// Acquire incremental partitions.
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
ah, &ah_inc );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
&ah_pack, &ah_pack_inc );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
&c1_pack, &c1_pack_inc );
|
||||
|
||||
// Pack Bh_inc and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha, &bh_inc, &bh_pack_inc, cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&bh_pack_inc,
|
||||
&BLIS_ONE,
|
||||
&c1_pack_inc,
|
||||
cntl_sub_herk( cntl ) );
|
||||
|
||||
// Pack Ah_inc and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj, &ah_inc, &ah_pack_inc, cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( &BLIS_ONE,
|
||||
&b1_pack,
|
||||
&ah_pack_inc,
|
||||
&BLIS_ONE,
|
||||
&c1_pack_inc,
|
||||
cntl_sub_herk( cntl ) );
|
||||
}
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1_pack, &c1, cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = bm_alg; i < m_trans; i += bm_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
bm_alg = bl2_determine_blocksize_b( i, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1, B1, and C1.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, bm_alg, a, &a1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, bm_alg, b, &b1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, bm_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding regions
|
||||
// of Bh_pack and Ah_pack. We compute the width of the subpartition
|
||||
// taking the location of the diagonal into account.
|
||||
offL = 0;
|
||||
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
|
||||
bl2_obj_diag_offset_after_trans( c1 ) + bm_alg );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &c1, &c1L );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &bh_pack, &bhL_pack );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &ah_pack, &ahL_pack );
|
||||
|
||||
// Initialize objects for packing A1, B1, and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack B1 and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
&b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_her2k_int( alpha,
|
||||
&a1_pack,
|
||||
&bhL_pack,
|
||||
alpha_conj,
|
||||
&b1_pack,
|
||||
&ahL_pack,
|
||||
beta,
|
||||
&c1L_pack,
|
||||
cntl_sub_her2k( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1L_pack, &c1L,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &bh_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
bl2_obj_release_pack( &ah_pack );
|
||||
bl2_obj_release_pack( &c1_pack );
|
||||
bl2_obj_release_pack( &c1L_pack );
|
||||
}
|
||||
|
||||
@@ -1,245 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_her2k_u_blk_var4( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* bh,
|
||||
obj_t* alpha_conj,
|
||||
obj_t* b,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
her2k_t* cntl )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t bh_pack, bhR_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t ah_pack, ahR_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t c1R, c1R_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t bm_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offR, nR;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bl2_obj_init_pack( &a1_pack );
|
||||
bl2_obj_init_pack( &bh_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
bl2_obj_init_pack( &ah_pack );
|
||||
bl2_obj_init_pack( &c1_pack );
|
||||
bl2_obj_init_pack( &c1R_pack );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bl2_obj_length_after_trans( *c );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bl2_scalm_int( beta,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing B1'.
|
||||
bl2_packm_init( bh, &bh_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Initialize object for packing A1'.
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Fuse the first iteration with incremental packing and computation.
|
||||
{
|
||||
obj_t bh_inc, bh_pack_inc;
|
||||
obj_t ah_inc, ah_pack_inc;
|
||||
obj_t c1_pack_inc;
|
||||
|
||||
dim_t j;
|
||||
dim_t bn_inc;
|
||||
dim_t n_trans;
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bl2_obj_width( bh_pack );
|
||||
|
||||
// Determine the current algorithmic blocksize.
|
||||
bm_alg = bl2_determine_blocksize_f( 0, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1, B1, and C1.
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
0, bm_alg, a, &a1 );
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
0, bm_alg, b, &b1 );
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
0, bm_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1, B1, and C1.
|
||||
bl2_packm_init( &a1, &a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &b1, &b1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1, &c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta, &c1, &c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha, &a1, &a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack B1 and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj, &b1, &b1_pack, cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( j = 0; j < n_trans; j += bn_inc )
|
||||
{
|
||||
// Determine the current incremental packing blocksize.
|
||||
bn_inc = bl2_determine_blocksize_f( j, n_trans, a,
|
||||
cntl_blocksize_aux( cntl ) );
|
||||
|
||||
// Acquire incremental partitions.
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
bh, &bh_inc );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
&bh_pack, &bh_pack_inc );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
&c1_pack, &c1_pack_inc );
|
||||
|
||||
// Acquire incremental partitions.
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
ah, &ah_inc );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
&ah_pack, &ah_pack_inc );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
|
||||
&c1_pack, &c1_pack_inc );
|
||||
|
||||
// Pack Bh_inc and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha, &bh_inc, &bh_pack_inc, cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&bh_pack_inc,
|
||||
&BLIS_ONE,
|
||||
&c1_pack_inc,
|
||||
cntl_sub_herk( cntl ) );
|
||||
|
||||
// Pack Ah_inc and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj, &ah_inc, &ah_pack_inc, cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( &BLIS_ONE,
|
||||
&b1_pack,
|
||||
&ah_pack_inc,
|
||||
&BLIS_ONE,
|
||||
&c1_pack_inc,
|
||||
cntl_sub_herk( cntl ) );
|
||||
}
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1_pack, &c1, cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = bm_alg; i < m_trans; i += bm_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
bm_alg = bl2_determine_blocksize_f( i, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1, B1, and C1.
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, bm_alg, a, &a1 );
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, bm_alg, b, &b1 );
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, bm_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding regions
|
||||
// of Bh_pack and Ah_pack. We compute the width of the subpartition
|
||||
// taking the location of the diagonal into account.
|
||||
offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) );
|
||||
nR = bl2_obj_width_after_trans( c1 ) - offR;
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offR, nR, &c1, &c1R );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offR, nR, &bh_pack, &bhR_pack );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offR, nR, &ah_pack, &ahR_pack );
|
||||
|
||||
// Initialize objects for packing A1, B1, and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1R, &c1R_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack B1 and scale by alpha_conj (if instructed).
|
||||
bl2_packm_int( alpha_conj,
|
||||
&b1, &b1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1R, &c1R_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_her2k_int( alpha,
|
||||
&a1_pack,
|
||||
&bhR_pack,
|
||||
alpha_conj,
|
||||
&b1_pack,
|
||||
&ahR_pack,
|
||||
beta,
|
||||
&c1R_pack,
|
||||
cntl_sub_her2k( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1R_pack, &c1R,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &bh_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
bl2_obj_release_pack( &ah_pack );
|
||||
bl2_obj_release_pack( &c1_pack );
|
||||
bl2_obj_release_pack( &c1R_pack );
|
||||
}
|
||||
|
||||
@@ -1,358 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T her2k_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
doff_t diagoffc,
|
||||
uplo_t uploc,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t ps_a,
|
||||
void* bh, inc_t ps_bh,
|
||||
void* b, inc_t ps_b,
|
||||
void* ah, inc_t ps_ah,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,her2k_u_ker_var3);
|
||||
|
||||
|
||||
void bl2_her2k_u_ker_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* bh,
|
||||
obj_t* alpha_conj,
|
||||
obj_t* b,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
her2k_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
|
||||
doff_t diagoffc = bl2_obj_diag_offset( *c );
|
||||
uplo_t uploc = bl2_obj_uplo( *c );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t ps_a = bl2_obj_panel_stride( *a );
|
||||
|
||||
void* buf_bh = bl2_obj_buffer_at_off( *bh );
|
||||
inc_t ps_bh = bl2_obj_panel_stride( *bh );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t ps_b = bl2_obj_panel_stride( *b );
|
||||
|
||||
void* buf_ah = bl2_obj_buffer_at_off( *ah );
|
||||
inc_t ps_ah = bl2_obj_panel_stride( *ah );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate (by a factor of two):
|
||||
// - the m dimension,
|
||||
// - the column stride of c,
|
||||
// - the column stride (ie: the panel length) of a, and
|
||||
// - the panel stride of a.
|
||||
if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
//cs_a *= 2;
|
||||
ps_a *= 2;
|
||||
}
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffc,
|
||||
uploc,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_a, ps_a,
|
||||
buf_bh, ps_bh,
|
||||
buf_b, ps_b,
|
||||
buf_ah, ps_ah,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t ps_a, \
|
||||
void* bh, inc_t ps_bh, \
|
||||
void* b, inc_t ps_b, \
|
||||
void* ah, inc_t ps_ah, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary b buffers for duplicating elements of bh, ah. */ \
|
||||
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
|
||||
PASTEMAC2(ch,varname,_nr) * \
|
||||
PASTEMAC2(ch,varname,_ndup) ]; \
|
||||
ctype ad[ PASTEMAC2(ch,varname,_kc) * \
|
||||
PASTEMAC2(ch,varname,_nr) * \
|
||||
PASTEMAC2(ch,varname,_ndup) ]; \
|
||||
\
|
||||
/* Temporary c buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC2(ch,varname,_mr) * PASTEMAC2(ch,varname,_nr) ]; \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
|
||||
\
|
||||
/* Alias the m and n register blocksizes to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
|
||||
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
|
||||
\
|
||||
ctype* a_cast = a; \
|
||||
ctype* bh_cast = bh; \
|
||||
ctype* b_cast = b; \
|
||||
ctype* ah_cast = ah; \
|
||||
ctype* c_cast = c; \
|
||||
ctype* a1; \
|
||||
ctype* bh1; \
|
||||
ctype* b1; \
|
||||
ctype* ah1; \
|
||||
ctype* c1; \
|
||||
ctype* c11; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t i, j; \
|
||||
\
|
||||
if ( bl2_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == GEMM_MR
|
||||
ps_a == stride to next row panel of A
|
||||
rs_b == GEMM_NR
|
||||
cs_b == 1
|
||||
ps_b == stride to next column panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = MR * rs_c; \
|
||||
cstep_c = NR * cs_c; \
|
||||
\
|
||||
bh1 = bh_cast; \
|
||||
ah1 = ah_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
b1 = b_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Copy the current iteration's NR columns of B to a local buffer
|
||||
with each value duplicated. */ \
|
||||
PASTEMAC2(ch,varname,_dupl)( k, bh1, bd ); \
|
||||
PASTEMAC2(ch,varname,_dupl)( k, ah1, ad ); \
|
||||
\
|
||||
/* Interior loop. */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute in the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly above the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly below the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
if ( bl2_intersects_diag_n( diagoffc_ij, MR, NR ) ) \
|
||||
{ \
|
||||
/* Zero the temporary C buffer. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
a1, \
|
||||
bd, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
b1, \
|
||||
ad, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to only the stored part of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn_u)( diagoffc_ij, \
|
||||
MR, NR, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bl2_is_strictly_above_diag_n( diagoffc_ij, MR, NR ) ) \
|
||||
{ \
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
a1, \
|
||||
bd, \
|
||||
c11, rs_c, cs_c ); \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
b1, \
|
||||
ad, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
b1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Bottom edge handling. This case never occurs since the bottom
|
||||
edge is never reached as part of the interior loop. (It is only
|
||||
updated as part of the bottom-right corner handling below.) */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
; \
|
||||
} \
|
||||
\
|
||||
bh1 += cstep_b; \
|
||||
ah1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
if ( n_left ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
b1 = b_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Copy the n_left (+ padding) columns of B to a local buffer
|
||||
with each value duplicated. */ \
|
||||
PASTEMAC2(ch,varname,_dupl)( k, bh1, bd ); \
|
||||
PASTEMAC2(ch,varname,_dupl)( k, ah1, ad ); \
|
||||
\
|
||||
/* Right edge loop. (Note that the diagonal is guaranteed not
|
||||
to factor in here.) */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Zero the temporary C buffer. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, n_left, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
a1, \
|
||||
bd, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
b1, \
|
||||
ad, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to the right edge of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( MR, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
b1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Compute the diagonal offset one last time. */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
/* Bottom-right corner handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Zero the temporary C buffer. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( m_left, n_left, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
a1, \
|
||||
bd, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
b1, \
|
||||
ad, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to only the stored part of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn_u)( diagoffc_ij, \
|
||||
m_left, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( her2k, her2k_u_ker_var3 )
|
||||
|
||||
@@ -1,103 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Default register blocksizes and micro-kernel shapes
|
||||
//
|
||||
// NOTE: These MR and NR values below MUST match the values that packm uses
|
||||
// when initializing its control tree node.
|
||||
//
|
||||
#include "bl2_gemm_4x2.h"
|
||||
#include "bl2_dupl_kx2.h"
|
||||
|
||||
#define bl2_sher2k_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_S
|
||||
#define bl2_sher2k_u_ker_var3_kc BLIS_DEFAULT_KC_S
|
||||
#define bl2_sher2k_u_ker_var3_mr BLIS_DEFAULT_MR_S
|
||||
#define bl2_sher2k_u_ker_var3_nr BLIS_DEFAULT_NR_S
|
||||
#define bl2_sher2k_u_ker_var3_ukr bl2_sgemm_4x2
|
||||
#define bl2_sher2k_u_ker_var3_dupl bl2_sdupl_kx2
|
||||
|
||||
#define bl2_dher2k_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_D
|
||||
#define bl2_dher2k_u_ker_var3_kc BLIS_DEFAULT_KC_D
|
||||
#define bl2_dher2k_u_ker_var3_mr BLIS_DEFAULT_MR_D
|
||||
#define bl2_dher2k_u_ker_var3_nr BLIS_DEFAULT_NR_D
|
||||
#define bl2_dher2k_u_ker_var3_ukr bl2_dgemm_4x2
|
||||
#define bl2_dher2k_u_ker_var3_dupl bl2_ddupl_kx2
|
||||
|
||||
#define bl2_cher2k_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_C
|
||||
#define bl2_cher2k_u_ker_var3_kc BLIS_DEFAULT_KC_C
|
||||
#define bl2_cher2k_u_ker_var3_mr BLIS_DEFAULT_MR_C
|
||||
#define bl2_cher2k_u_ker_var3_nr BLIS_DEFAULT_NR_C
|
||||
#define bl2_cher2k_u_ker_var3_ukr bl2_cgemm_4x2
|
||||
#define bl2_cher2k_u_ker_var3_dupl bl2_cdupl_kx2
|
||||
|
||||
#define bl2_zher2k_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_Z
|
||||
#define bl2_zher2k_u_ker_var3_kc BLIS_DEFAULT_KC_Z
|
||||
#define bl2_zher2k_u_ker_var3_mr BLIS_DEFAULT_MR_Z
|
||||
#define bl2_zher2k_u_ker_var3_nr BLIS_DEFAULT_NR_Z
|
||||
#define bl2_zher2k_u_ker_var3_ukr bl2_zgemm_4x2
|
||||
#define bl2_zher2k_u_ker_var3_dupl bl2_zdupl_kx2
|
||||
|
||||
|
||||
|
||||
void bl2_her2k_u_ker_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* bh,
|
||||
obj_t* alpha_conj,
|
||||
obj_t* b,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
her2k_t* cntl );
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t ps_a, \
|
||||
void* bh, inc_t ps_bh, \
|
||||
void* b, inc_t ps_b, \
|
||||
void* ah, inc_t ps_ah, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( her2k_u_ker_var3 )
|
||||
|
||||
@@ -1,133 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_herk_l_blk_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
herk_t* cntl )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t ah_pack, ahL_pack;
|
||||
obj_t c1;
|
||||
obj_t c1L, c1L_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offL, nL;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bl2_obj_init_pack( &a1_pack );
|
||||
bl2_obj_init_pack( &ah_pack );
|
||||
bl2_obj_init_pack( &c1L_pack );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bl2_obj_length_after_trans( *c );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bl2_scalm_int( beta,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A'.
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack A' and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bl2_determine_blocksize_b( i, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, a, &a1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding region
|
||||
// of Ah_pack. We compute the width of the subpartition taking the
|
||||
// location of the diagonal into account.
|
||||
offL = 0;
|
||||
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
|
||||
bl2_obj_diag_offset_after_trans( c1 ) + b_alg );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &c1, &c1L );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &ah_pack, &ahL_pack );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( alpha,
|
||||
&a1_pack,
|
||||
&ahL_pack,
|
||||
beta,
|
||||
&c1L_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1L_pack, &c1L,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &ah_pack );
|
||||
bl2_obj_release_pack( &c1L_pack );
|
||||
}
|
||||
|
||||
@@ -1,160 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
extern gemm_t* gemm_cntl_bp_ke;
|
||||
|
||||
void bl2_herk_l_blk_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
herk_t* cntl )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t ah_pack, ahL_pack, ahM_pack;
|
||||
obj_t c1, c1M;
|
||||
obj_t c1L, c1L_pack, c1M_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offL, nL;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bl2_obj_init_pack( &a1_pack );
|
||||
bl2_obj_init_pack( &ah_pack );
|
||||
bl2_obj_init_pack( &c1L_pack );
|
||||
bl2_obj_init_pack( &c1M_pack );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bl2_obj_length_after_trans( *c );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bl2_scalm_int( beta,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A'.
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack A' and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bl2_determine_blocksize_b( i, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, a, &a1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding region
|
||||
// of Ah_pack. We compute the width of the subpartition taking the
|
||||
// location of the diagonal into account.
|
||||
offL = 0;
|
||||
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
|
||||
bl2_obj_diag_offset_after_trans( c1 ) );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &c1, &c1L );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &ah_pack, &ahL_pack );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
nL, b_alg, &c1, &c1M );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
nL, b_alg, &ah_pack, &ahM_pack );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
bl2_packm_init( &c1M, &c1M_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1M, &c1M_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_gemm_int( alpha,
|
||||
&a1_pack,
|
||||
&ahL_pack,
|
||||
beta,
|
||||
&c1L_pack,
|
||||
gemm_cntl_bp_ke );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( alpha,
|
||||
&a1_pack,
|
||||
&ahM_pack,
|
||||
beta,
|
||||
&c1M_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1L_pack, &c1L,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1M_pack, &c1M,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &ah_pack );
|
||||
bl2_obj_release_pack( &c1L_pack );
|
||||
bl2_obj_release_pack( &c1M_pack );
|
||||
}
|
||||
|
||||
@@ -1,143 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_herk_l_blk_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
herk_t* cntl )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t ah_pack, ahL_pack;
|
||||
obj_t c1;
|
||||
obj_t c1L, c1L_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offL, nL;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bl2_obj_init_pack( &a1_pack );
|
||||
bl2_obj_init_pack( &ah_pack );
|
||||
bl2_obj_init_pack( &c1L_pack );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bl2_obj_length_after_trans( *c );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bl2_scalm_int( beta,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A'.
|
||||
bl2_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack A' and scale by alpha (if instructed).
|
||||
//bl2_packm_int( alpha,
|
||||
// ah, &ah_pack,
|
||||
// cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
{
|
||||
obj_t ah_inc, ah_pack_inc;
|
||||
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bl2_determine_blocksize_f( i, m_trans, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, b_alg, a, &a1 );
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding region
|
||||
// of Ah_pack. We compute the width of the subpartition taking the
|
||||
// location of the diagonal into account.
|
||||
offL = 0;
|
||||
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
|
||||
bl2_obj_diag_offset_after_trans( c1 ) + b_alg );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &c1, &c1L );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
offL, nL, &ah_pack, &ahL_pack );
|
||||
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
i, b_alg, ah, &ah_inc );
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
i, b_alg, &ah_pack, &ah_pack_inc );
|
||||
bl2_packm_int( alpha,
|
||||
&ah_inc, &ah_pack_inc,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
bl2_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bl2_packm_init( &c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack A1 and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack C1 and scale by beta (if instructed).
|
||||
bl2_packm_int( beta,
|
||||
&c1L, &c1L_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bl2_herk_int( alpha,
|
||||
&a1_pack,
|
||||
&ahL_pack,
|
||||
beta,
|
||||
&c1L_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bl2_unpackm_int( &c1L_pack, &c1L,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a1_pack );
|
||||
bl2_obj_release_pack( &ah_pack );
|
||||
bl2_obj_release_pack( &c1L_pack );
|
||||
}
|
||||
|
||||
@@ -1,465 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
/* Name under which the function-pointer type below is declared. */
#define FUNCPTR_T gemm_fp
|
||||
|
||||
/* Pointer to a typed kernel variant. The parameters are the diagonal
   offset of A, the dimensions m, n and k, then for A and B a buffer with
   row, column and panel strides, a pointer to beta, and for C a buffer
   with row and column strides. */
typedef void (*FUNCPTR_T)(
|
||||
doff_t diagoffa,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
/* File-local table of typed implementations; bl2_trmm_l_ker_var2() indexes
   it with the execution datatype. NOTE(review): GENARRAY presumably expands
   to the array declarator plus an initializer listing the per-datatype
   functions -- confirm against its definition. */
static FUNCPTR_T GENARRAY(ftypes,trmm_l_ker_var2);
|
||||
|
||||
|
||||
void bl2_trmm_l_ker_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
|
||||
doff_t diagoffa = bl2_obj_diag_offset( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
inc_t ps_a = bl2_obj_panel_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
inc_t ps_b = bl2_obj_panel_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
/*
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate (by a factor of two):
|
||||
// - the m dimension,
|
||||
// - the column stride of c,
|
||||
// - the column stride (ie: the panel length) of a, and
|
||||
// - the panel stride of a.
|
||||
if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
ps_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// If beta is a scalar constant, use dt_exec to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the beta object and extract the buffer at the beta offset.
|
||||
// (If beta is complex with a zero imaginary component, that is reflected
|
||||
// in dt_beta. However, that functionality is not used here.)
|
||||
bl2_set_scalar_dt_buffer( beta, dt_exec, dt_beta, buf_beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a, ps_a,
|
||||
buf_b, rs_b, cs_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
/*
   GENTFUNC template for the typed implementations of trmm_l_ker_var2.
   Each instantiation defines PASTEMAC(ch,varname) operating on ctype data.

   Structure of the generated function:
   - It returns at once if bl2_zero_dim3( m, n, k ) holds or if
     bl2_is_strictly_above_diag_n( diagoffa, m, k ) holds.
   - If diagoffa is negative, the pointer into C is advanced by -diagoffa
     rows, m is reduced by the same amount and diagoffa is reset to zero.
   - n is split into n_iter = n / NR column panels plus n_left leftover
     columns, and m into m_iter = m / MR row panels plus m_left leftover
     rows. Before each column panel is processed, the _dupl routine is
     called with ( k_ndup, b1, bd ).
   - For each row panel of A: if the panel intersects the diagonal, the
     micro-kernel is called with k_a1011 = min( k, diagoffa_i + panel
     height ) and the caller's beta; otherwise, if the panel lies strictly
     below the diagonal, the micro-kernel is called with the full k and
     `one` in the beta position. In any other case no micro-kernel is
     called for that panel.
   - Tiles narrower than NR or shorter than MR are copied into the local
     buffer ct with copys_mxn, updated there by the micro-kernel, and
     copied back to C.
   NOTE(review): the second micro-kernel argument is always `one`;
   presumably this is the alpha scalar -- confirm against the micro-kernel
   prototype.
*/
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffa, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
|
||||
PASTEMAC2(ch,varname,_nr) * \
|
||||
PASTEMAC2(ch,varname,_ndup) ]; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
|
||||
PASTEMAC2(ch,varname,_nr) ]; \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
|
||||
\
|
||||
/* Alias some constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
|
||||
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
|
||||
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict bd_i; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t k_ndup; \
|
||||
dim_t k_a1011; \
|
||||
dim_t off_a1011; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == GEMM_MR
|
||||
ps_a == stride to next row panel of A
|
||||
rs_b == GEMM_NR
|
||||
cs_b == 1
|
||||
ps_b == stride to next column panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
if ( bl2_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
|
||||
So we do nothing. */ \
|
||||
if ( bl2_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* If the diagonal offset is negative, adjust the pointer to C and
|
||||
treat this case as if the diagonal offset were zero. Note that
|
||||
we don't need to adjust the pointer to A since packm would have
|
||||
simply skipped over the panels that were not stored. */ \
|
||||
if ( diagoffa < 0 ) \
|
||||
{ \
|
||||
i = -diagoffa; \
|
||||
m = m - i; \
|
||||
diagoffa = 0; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
} \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
/* k_ndup is the first argument passed to the _dupl routine below. Note
   that k_a1011 is reassigned per row panel inside the loops. */ \
k_a1011 = bl2_min( k, diagoffa + m ); \
|
||||
k_ndup = k_a1011 * NDUP; \
|
||||
\
|
||||
/* Step sizes: a1 advances by rstep_a after a panel strictly below the
   diagonal; b1 advances by cstep_b and c1 by cstep_c per column panel;
   c11 advances by rstep_c per row panel. */ \
rstep_a = k * MR; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = MR * rs_c; \
|
||||
cstep_c = NR * cs_c; \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Copy the current iteration's NR columns of B to a local buffer
|
||||
with each value duplicated. */ \
|
||||
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
|
||||
\
|
||||
/* Interior loop. */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bl2_min( k, diagoffa_i + MR ); \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
|
||||
one, \
|
||||
a1, \
|
||||
bd_i, \
|
||||
beta, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += k_a1011 * MR; \
|
||||
} \
|
||||
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Bottom edge handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, m_left, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bl2_min( k, diagoffa_i + m_left ); \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
|
||||
one, \
|
||||
a1, \
|
||||
bd_i, \
|
||||
beta, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, m_left, k ) ) \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
if ( n_left ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Copy the n_left (+ padding) columns of B to a local buffer
|
||||
with each value duplicated. */ \
|
||||
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
|
||||
\
|
||||
/* Right edge loop. */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bl2_min( k, diagoffa_i + MR ); \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
|
||||
one, \
|
||||
a1, \
|
||||
bd_i, \
|
||||
beta, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the right edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += k_a1011 * MR; \
|
||||
} \
|
||||
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the right edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Bottom-right corner handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, m_left, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bl2_min( k, diagoffa_i + m_left ); \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
|
||||
one, \
|
||||
a1, \
|
||||
bd_i, \
|
||||
beta, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom-right corner of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, m_left, k ) ) \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom-right corner of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
/* Expand the GENTFUNC template defined above for trmm_l_ker_var2.
   NOTE(review): presumably one expansion per basic datatype -- confirm
   against the definition of INSERT_GENTFUNC_BASIC. */
INSERT_GENTFUNC_BASIC( trmm, trmm_l_ker_var2 )
|
||||
|
||||
@@ -1,218 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Bottom edge handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Use the diagonal offset for the current panel of A to compute
|
||||
k_use <= k so that we minimize the number of flops with zeros
|
||||
(ie: when the current panel intersects the diagonal). */ \
|
||||
diagoffa_i = diagoffa + (doff_t)i*MR; \
|
||||
k_diag = diagoffa_i + MR; \
|
||||
if ( k_diag < 0 ) k_use = 0; \
|
||||
else if ( k_diag > k ) k_use = k; \
|
||||
else k_use = k_diag; \
|
||||
\
|
||||
/* If the current panel intersects the diagonal, we need to
|
||||
scale by beta. (When the current function is invoked as
|
||||
part of classic trmm, beta will be zero, and when invoked as
|
||||
part of trmm3, beta will be non-zero). If the current panel
|
||||
does not intersect the diagonal (but still has non-zero
|
||||
elements), we accumulate into C (for both trmm and trmm3). */ \
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_use, \
|
||||
a1, \
|
||||
bd, \
|
||||
beta, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( k_use != 0 ) \
|
||||
{ \
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_use, \
|
||||
a1, \
|
||||
bd, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( m_left, NR, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
if ( n_left ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Copy the n_left (+ padding) columns of B to a local buffer
|
||||
with each value duplicated. */ \
|
||||
PASTEMAC2(ch,varname,_dupl)( k, b1, bd ); \
|
||||
\
|
||||
/* Right edge loop. */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Use the diagonal offset for the current panel of A to compute
|
||||
k_use <= k so that we minimize the number of flops with zeros
|
||||
(ie: when the current panel intersects the diagonal). */ \
|
||||
diagoffa_i = diagoffa + (doff_t)i*MR; \
|
||||
k_diag = diagoffa_i + MR; \
|
||||
if ( k_diag < 0 ) k_use = 0; \
|
||||
else if ( k_diag > k ) k_use = k; \
|
||||
else k_use = k_diag; \
|
||||
\
|
||||
/* If the current panel intersects the diagonal, we need to
|
||||
scale by beta. (When the current function is invoked as
|
||||
part of classic trmm, beta will be zero, and when invoked as
|
||||
part of trmm3, beta will be non-zero). If the current panel
|
||||
does not intersect the diagonal (but still has non-zero
|
||||
elements), we accumulate into C (for both trmm and trmm3). */ \
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_use, \
|
||||
a1, \
|
||||
bd, \
|
||||
beta, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the right edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( k_use != 0 ) \
|
||||
{ \
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_use, \
|
||||
a1, \
|
||||
bd, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to the right edge of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( MR, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Bottom-right corner handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Use the diagonal offset for the current panel of A to compute
|
||||
k_use <= k so that we minimize the number of flops with zeros
|
||||
(ie: when the current panel intersects the diagonal). */ \
|
||||
diagoffa_i = diagoffa + (doff_t)i*MR; \
|
||||
k_diag = diagoffa_i + MR; \
|
||||
if ( k_diag < 0 ) k_use = 0; \
|
||||
else if ( k_diag > k ) k_use = k; \
|
||||
else k_use = k_diag; \
|
||||
\
|
||||
/* If the current panel intersects the diagonal, we need to
|
||||
scale by beta. (When the current function is invoked as
|
||||
part of classic trmm, beta will be zero, and when invoked as
|
||||
part of trmm3, beta will be non-zero). If the current panel
|
||||
does not intersect the diagonal (but still has non-zero
|
||||
elements), we accumulate into C (for both trmm and trmm3). */ \
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_use, \
|
||||
a1, \
|
||||
bd, \
|
||||
beta, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom-right corner of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( k_use != 0 ) \
|
||||
{ \
|
||||
/* Invoke the micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_use, \
|
||||
a1, \
|
||||
bd, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to the bottom-right corner of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( m_left, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
/* Expand the preceding GENTFUNC template for trmm_l_ker_var2.
   NOTE(review): presumably one expansion per basic datatype -- confirm
   against the definition of INSERT_GENTFUNC_BASIC. */
INSERT_GENTFUNC_BASIC( trmm, trmm_l_ker_var2 )
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
@@ -1,126 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_trmm_ll_blk_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
obj_t a11, a11_pack;
|
||||
obj_t a10;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t b0;
|
||||
|
||||
dim_t ij;
|
||||
dim_t b_alg;
|
||||
dim_t mn;
|
||||
|
||||
// Initialize objects for packing.
|
||||
bl2_obj_init_pack( &a11_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
|
||||
// Query dimension. Since A should be square, any transposition
|
||||
// embedded in the object can be ignored.
|
||||
mn = bl2_obj_length( *a );
|
||||
|
||||
// Scale B by alpha (if instructed).
|
||||
bl2_scalm_int( alpha,
|
||||
b,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of a (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bl2_determine_blocksize_b( ij, mn,
|
||||
a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A11 and A10.
|
||||
bl2_acquire_mpart_br2tl( BLIS_SUBPART11,
|
||||
ij, b_alg, a, &a11 );
|
||||
bl2_acquire_mpart_br2tl( BLIS_SUBPART10,
|
||||
ij, b_alg, a, &a10 );
|
||||
|
||||
// Acquire partitions for B1 and B0.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
ij, b_alg, b, &b1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART0,
|
||||
ij, b_alg, b, &b0 );
|
||||
|
||||
// Copy/pack A11 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a11,
|
||||
&a11_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Copy/pack B1 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&b1,
|
||||
&b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// B1 = tril( A11 ) * B1;
|
||||
bl2_trmm_int( BLIS_LEFT,
|
||||
&BLIS_ONE,
|
||||
&a11_pack,
|
||||
&b1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
|
||||
// B1 = B1 + A10 * B0;
|
||||
bl2_gemm_int( &BLIS_ONE,
|
||||
&a10,
|
||||
&b0,
|
||||
&BLIS_ONE,
|
||||
&b1_pack,
|
||||
cntl_sub_gemm( cntl ) );
|
||||
|
||||
// Copy/unpack B1 (if B1 was packed).
|
||||
bl2_unpackm_int( &b1_pack,
|
||||
&b1,
|
||||
cntl_sub_unpackm_b( cntl ) );
|
||||
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a11_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
}
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_ll_blk_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_trmm_ll_blk_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
obj_t a11, a11_pack;
|
||||
obj_t a21;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t b2;
|
||||
|
||||
dim_t ij;
|
||||
dim_t b_alg;
|
||||
dim_t mn;
|
||||
|
||||
// Initialize objects for packing.
|
||||
bl2_obj_init_pack( &a11_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
|
||||
// Query dimension. Since A should be square, any transposition
|
||||
// embedded in the object can be ignored.
|
||||
mn = bl2_obj_length( *a );
|
||||
|
||||
// Scale B by alpha (if instructed).
|
||||
bl2_scalm_int( alpha,
|
||||
b,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of a (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bl2_determine_blocksize_b( ij, mn,
|
||||
a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A11 and A01.
|
||||
bl2_acquire_mpart_br2tl( BLIS_SUBPART11,
|
||||
ij, b_alg, a, &a11 );
|
||||
bl2_acquire_mpart_br2tl( BLIS_SUBPART21,
|
||||
ij, b_alg, a, &a21 );
|
||||
|
||||
// Acquire partitions for B1 and B0.
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
ij, b_alg, b, &b1 );
|
||||
bl2_acquire_mpart_b2t( BLIS_SUBPART2,
|
||||
ij, b_alg, b, &b2 );
|
||||
|
||||
// Copy/pack A11 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a11,
|
||||
&a11_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Copy/pack B1 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&b1,
|
||||
&b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// B2 = B2 + A21 * B1;
|
||||
bl2_gemm_int( &BLIS_ONE,
|
||||
&a21,
|
||||
&b1_pack,
|
||||
&BLIS_ONE,
|
||||
&b2,
|
||||
cntl_sub_gemm( cntl ) );
|
||||
|
||||
// B1 = tril( A11 ) * B1;
|
||||
bl2_trmm_int( BLIS_LEFT,
|
||||
&BLIS_ONE,
|
||||
&a11_pack,
|
||||
&b1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
|
||||
// Copy/unpack B1 (if B1 was packed).
|
||||
bl2_unpackm_int( &b1_pack,
|
||||
&b1,
|
||||
cntl_sub_unpackm_b( cntl ) );
|
||||
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a11_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
}
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_ll_blk_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
@@ -1,107 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_trmm_ll_blk_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t b1, b1_pack;
|
||||
|
||||
dim_t j;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Initialize objects for packing.
|
||||
bl2_obj_init_pack( &a_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bl2_obj_width_after_trans( *b );
|
||||
|
||||
// Scale B by alpha (if instructed).
|
||||
bl2_scalm_int( alpha,
|
||||
b,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( j = 0; j < n_trans; j += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of a (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bl2_determine_blocksize_f( j, n_trans,
|
||||
a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1.
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
j, b_alg, b, &b1 );
|
||||
|
||||
// Copy/pack A (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
a,
|
||||
&a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Copy/pack B1 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&b1,
|
||||
&b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// B1 = tril( A ) * B1;
|
||||
bl2_trmm_int( BLIS_LEFT,
|
||||
alpha,
|
||||
&a_pack,
|
||||
&b1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
|
||||
// Copy/unpack B1 (if B1 was packed).
|
||||
bl2_unpackm_int( &b1_pack,
|
||||
&b1,
|
||||
cntl_sub_unpackm_b( cntl ) );
|
||||
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
}
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_ll_blk_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
@@ -1,162 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T trmm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
trans_t transa,
|
||||
diag_t diag,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b
|
||||
);
|
||||
|
||||
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
bl2_strmm_ll_unb_var1,
|
||||
bl2_ctrmm_ll_unb_var1,
|
||||
bl2_dtrmm_ll_unb_var1,
|
||||
bl2_ztrmm_ll_unb_var1
|
||||
};
|
||||
|
||||
void bl2_trmm_ll_unb_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
num_t dt_a = bl2_obj_datatype( *a );
|
||||
|
||||
trans_t transa = bl2_obj_conjtrans_status( *a );
|
||||
diag_t diag = bl2_obj_diag( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *b );
|
||||
dim_t n = bl2_obj_width( *b );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_alpha = bl2_obj_scalar_buffer( dt_a, *alpha );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_a];
|
||||
|
||||
// Invoke the function.
|
||||
f( transa,
|
||||
diag,
|
||||
m,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
) \
|
||||
{ \
|
||||
ctype* alpha_cast = alpha; \
|
||||
ctype* a_cast = a; \
|
||||
ctype* b_cast = b; \
|
||||
ctype* one = PASTEMAC(ch,1); \
|
||||
ctype* a10t; \
|
||||
ctype* alpha11; \
|
||||
ctype* b0; \
|
||||
ctype* b1; \
|
||||
ctype alpha_alpha11_conj; \
|
||||
dim_t iter, i; \
|
||||
dim_t n_ahead; \
|
||||
conj_t conja; \
|
||||
\
|
||||
if ( bl2_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
conja = bl2_extract_conj( transa ); \
|
||||
\
|
||||
for ( iter = 0; iter < m; ++iter ) \
|
||||
{ \
|
||||
i = m - iter - 1; \
|
||||
n_ahead = i; \
|
||||
a10t = a_cast + (i )*rs_a + (0 )*cs_a; \
|
||||
alpha11 = a_cast + (i )*rs_a + (i )*cs_a; \
|
||||
b0 = b_cast + (0 )*rs_b + (0 )*cs_b; \
|
||||
b1 = b_cast + (i )*rs_b + (0 )*cs_b; \
|
||||
\
|
||||
/* b1 = alpha * alpha11 * b1; */ \
|
||||
PASTEMAC2(ch,ch,copys)( *alpha_cast, alpha_alpha11_conj ); \
|
||||
\
|
||||
if ( bl2_is_nonunit_diag( diag ) ) \
|
||||
PASTEMAC2(ch,ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
|
||||
\
|
||||
PASTEMAC2(ch,ch,scalv)( BLIS_NO_CONJUGATE, \
|
||||
n, \
|
||||
&alpha_alpha11_conj, \
|
||||
b1, cs_b ); \
|
||||
\
|
||||
/* b1 = b1 + alpha * a10t * B0; */ \
|
||||
/* = b1 + alpha * B0^T * a10t^T; */ \
|
||||
PASTEMAC(ch,gemv)( BLIS_TRANSPOSE, \
|
||||
conja, \
|
||||
n_ahead, \
|
||||
n, \
|
||||
alpha_cast, \
|
||||
b0, rs_b, cs_b, \
|
||||
a10t, cs_a, \
|
||||
one, \
|
||||
b1, cs_b ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm, trmm_ll_unb_var1 )
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_ll_unb_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_ll_unb_var1 )
|
||||
|
||||
@@ -1,159 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T trmm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
trans_t transa,
|
||||
diag_t diag,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b
|
||||
);
|
||||
|
||||
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
bl2_strmm_ll_unb_var2,
|
||||
bl2_ctrmm_ll_unb_var2,
|
||||
bl2_dtrmm_ll_unb_var2,
|
||||
bl2_ztrmm_ll_unb_var2
|
||||
};
|
||||
|
||||
void bl2_trmm_ll_unb_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
num_t dt_a = bl2_obj_datatype( *a );
|
||||
|
||||
trans_t transa = bl2_obj_conjtrans_status( *a );
|
||||
diag_t diag = bl2_obj_diag( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *b );
|
||||
dim_t n = bl2_obj_width( *b );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_alpha = bl2_obj_scalar_buffer( dt_a, *alpha );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_a];
|
||||
|
||||
// Invoke the function.
|
||||
f( transa,
|
||||
diag,
|
||||
m,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
) \
|
||||
{ \
|
||||
ctype* alpha_cast = alpha; \
|
||||
ctype* a_cast = a; \
|
||||
ctype* b_cast = b; \
|
||||
ctype* alpha11; \
|
||||
ctype* a21; \
|
||||
ctype* b1; \
|
||||
ctype* b2; \
|
||||
ctype alpha_alpha11_conj; \
|
||||
dim_t iter, i; \
|
||||
dim_t n_behind; \
|
||||
conj_t conja; \
|
||||
\
|
||||
if ( bl2_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
conja = bl2_extract_conj( transa ); \
|
||||
\
|
||||
for ( iter = 0; iter < m; ++iter ) \
|
||||
{ \
|
||||
i = m - iter - 1; \
|
||||
n_behind = iter; \
|
||||
alpha11 = a_cast + (i )*rs_a + (i )*cs_a; \
|
||||
a21 = a_cast + (i+1)*rs_a + (i )*cs_a; \
|
||||
b1 = b_cast + (i )*rs_b + (0 )*cs_b; \
|
||||
b2 = b_cast + (i+1)*rs_b + (0 )*cs_b; \
|
||||
\
|
||||
/* B2 = B2 + alpha * a21 * b1; */ \
|
||||
PASTEMAC(ch,ger)( conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n_behind, \
|
||||
n, \
|
||||
alpha_cast, \
|
||||
a21, rs_a, \
|
||||
b1, cs_b, \
|
||||
b2, rs_b, cs_b ); \
|
||||
\
|
||||
/* b1 = alpha * alpha11 * b1; */ \
|
||||
PASTEMAC2(ch,ch,copys)( *alpha_cast, alpha_alpha11_conj ); \
|
||||
\
|
||||
if ( bl2_is_nonunit_diag( diag ) ) \
|
||||
PASTEMAC2(ch,ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
|
||||
\
|
||||
PASTEMAC2(ch,ch,scalv)( BLIS_NO_CONJUGATE, \
|
||||
n, \
|
||||
&alpha_alpha11_conj, \
|
||||
b1, cs_b ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm, trmm_ll_unb_var2 )
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_ll_unb_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_ll_unb_var2 )
|
||||
|
||||
@@ -1,134 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T trmm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
trans_t transa,
|
||||
diag_t diag,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b
|
||||
);
|
||||
|
||||
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
bl2_strmm_ll_unb_var3,
|
||||
bl2_ctrmm_ll_unb_var3,
|
||||
bl2_dtrmm_ll_unb_var3,
|
||||
bl2_ztrmm_ll_unb_var3
|
||||
};
|
||||
|
||||
void bl2_trmm_ll_unb_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
num_t dt_a = bl2_obj_datatype( *a );
|
||||
|
||||
trans_t transa = bl2_obj_conjtrans_status( *a );
|
||||
diag_t diag = bl2_obj_diag( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *b );
|
||||
dim_t n = bl2_obj_width( *b );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_alpha = bl2_obj_scalar_buffer( dt_a, *alpha );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_a];
|
||||
|
||||
// Invoke the function.
|
||||
f( transa,
|
||||
diag,
|
||||
m,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
) \
|
||||
{ \
|
||||
ctype* alpha_cast = alpha; \
|
||||
ctype* a_cast = a; \
|
||||
ctype* b_cast = b; \
|
||||
ctype* b1; \
|
||||
dim_t j; \
|
||||
\
|
||||
if ( bl2_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
b1 = b_cast + (0 )*rs_b + (j )*cs_b; \
|
||||
\
|
||||
/* b1 = alpha * tril( A ) * b1; */ \
|
||||
PASTEMAC2(ch,ch,trmv)( BLIS_LOWER, \
|
||||
transa, \
|
||||
diag, \
|
||||
m, \
|
||||
alpha_cast, \
|
||||
a_cast, rs_a, cs_a, \
|
||||
b1, rs_b ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm, trmm_ll_unb_var3 )
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_ll_unb_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_ll_unb_var3 )
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_trmm_lu_blk_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
obj_t a11, a11_pack;
|
||||
obj_t a12;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t b2;
|
||||
|
||||
dim_t ij;
|
||||
dim_t b_alg;
|
||||
dim_t mn;
|
||||
|
||||
// Initialize objects for packing.
|
||||
bl2_obj_init_pack( &a11_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
|
||||
// Query dimension. Since A should be square, any transposition
|
||||
// embedded in the object can be ignored.
|
||||
mn = bl2_obj_length( *a );
|
||||
|
||||
// Scale B by alpha (if instructed).
|
||||
bl2_scalm_int( alpha,
|
||||
b,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of a (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bl2_determine_blocksize_f( ij, mn,
|
||||
a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A11 and A12.
|
||||
bl2_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
ij, b_alg, a, &a11 );
|
||||
bl2_acquire_mpart_tl2br( BLIS_SUBPART12,
|
||||
ij, b_alg, a, &a12 );
|
||||
|
||||
// Acquire partitions for B1 and B2.
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
ij, b_alg, b, &b1 );
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART2,
|
||||
ij, b_alg, b, &b2 );
|
||||
|
||||
// Copy/pack A11 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a11,
|
||||
&a11_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Copy/pack B1 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&b1,
|
||||
&b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// B1 = triu( A11 ) * B1;
|
||||
bl2_trmm_int( BLIS_LEFT,
|
||||
&BLIS_ONE,
|
||||
&a11_pack,
|
||||
&b1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
|
||||
// B1 = B1 + A12 * B2;
|
||||
bl2_gemm_int( &BLIS_ONE,
|
||||
&a12,
|
||||
&b2,
|
||||
&BLIS_ONE,
|
||||
&b1_pack,
|
||||
cntl_sub_gemm( cntl ) );
|
||||
|
||||
// Copy/unpack B1 (if B1 was packed).
|
||||
bl2_unpackm_int( &b1_pack,
|
||||
&b1,
|
||||
cntl_sub_unpackm_b( cntl ) );
|
||||
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a11_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
}
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_lu_blk_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_trmm_lu_blk_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
obj_t a11, a11_pack;
|
||||
obj_t a01;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t b0;
|
||||
|
||||
dim_t ij;
|
||||
dim_t b_alg;
|
||||
dim_t mn;
|
||||
|
||||
// Initialize objects for packing.
|
||||
bl2_obj_init_pack( &a11_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
|
||||
// Query dimension. Since A should be square, any transposition
|
||||
// embedded in the object can be ignored.
|
||||
mn = bl2_obj_length( *a );
|
||||
|
||||
// Scale B by alpha (if instructed).
|
||||
bl2_scalm_int( alpha,
|
||||
b,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of a (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bl2_determine_blocksize_f( ij, mn,
|
||||
a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A11 and A01.
|
||||
bl2_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
ij, b_alg, a, &a11 );
|
||||
bl2_acquire_mpart_tl2br( BLIS_SUBPART01,
|
||||
ij, b_alg, a, &a01 );
|
||||
|
||||
// Acquire partitions for B1 and B0.
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
ij, b_alg, b, &b1 );
|
||||
bl2_acquire_mpart_t2b( BLIS_SUBPART0,
|
||||
ij, b_alg, b, &b0 );
|
||||
|
||||
// Copy/pack A11 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&a11,
|
||||
&a11_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Copy/pack B1 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&b1,
|
||||
&b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// B0 = B0 + A01 * B1;
|
||||
bl2_gemm_int( &BLIS_ONE,
|
||||
&a01,
|
||||
&b1_pack,
|
||||
&BLIS_ONE,
|
||||
&b0,
|
||||
cntl_sub_gemm( cntl ) );
|
||||
|
||||
// B1 = triu( A11 ) * B1;
|
||||
bl2_trmm_int( BLIS_LEFT,
|
||||
&BLIS_ONE,
|
||||
&a11_pack,
|
||||
&b1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
|
||||
// Copy/unpack B1 (if B1 was packed).
|
||||
bl2_unpackm_int( &b1_pack,
|
||||
&b1,
|
||||
cntl_sub_unpackm_b( cntl ) );
|
||||
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a11_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
}
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_lu_blk_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
@@ -1,107 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
void bl2_trmm_lu_blk_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t b1, b1_pack;
|
||||
|
||||
dim_t j;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Initialize objects for packing.
|
||||
bl2_obj_init_pack( &a_pack );
|
||||
bl2_obj_init_pack( &b1_pack );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bl2_obj_width_after_trans( *b );
|
||||
|
||||
// Scale B by alpha (if instructed).
|
||||
bl2_scalm_int( alpha,
|
||||
b,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( j = 0; j < n_trans; j += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of a (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bl2_determine_blocksize_f( j, n_trans,
|
||||
a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1.
|
||||
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
j, b_alg, b, &b1 );
|
||||
|
||||
// Copy/pack A (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
a,
|
||||
&a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Copy/pack B1 (if instructed) and scale by alpha (if instructed).
|
||||
bl2_packm_int( alpha,
|
||||
&b1,
|
||||
&b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// B1 = triu( A ) * B1;
|
||||
bl2_trmm_int( BLIS_LEFT,
|
||||
alpha,
|
||||
&a_pack,
|
||||
&b1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
|
||||
// Copy/unpack B1 (if B1 was packed).
|
||||
bl2_unpackm_int( &b1_pack,
|
||||
&b1,
|
||||
cntl_sub_unpackm_b( cntl ) );
|
||||
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bl2_obj_release_pack( &a_pack );
|
||||
bl2_obj_release_pack( &b1_pack );
|
||||
}
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_lu_blk_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
@@ -1,162 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T trmm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
trans_t transa,
|
||||
diag_t diag,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b
|
||||
);
|
||||
|
||||
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
bl2_strmm_lu_unb_var1,
|
||||
bl2_ctrmm_lu_unb_var1,
|
||||
bl2_dtrmm_lu_unb_var1,
|
||||
bl2_ztrmm_lu_unb_var1
|
||||
};
|
||||
|
||||
void bl2_trmm_lu_unb_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
num_t dt_a = bl2_obj_datatype( *a );
|
||||
|
||||
trans_t transa = bl2_obj_conjtrans_status( *a );
|
||||
diag_t diag = bl2_obj_diag( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *b );
|
||||
dim_t n = bl2_obj_width( *b );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_alpha = bl2_obj_scalar_buffer( dt_a, *alpha );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_a];
|
||||
|
||||
// Invoke the function.
|
||||
f( transa,
|
||||
diag,
|
||||
m,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
) \
|
||||
{ \
|
||||
ctype* alpha_cast = alpha; \
|
||||
ctype* a_cast = a; \
|
||||
ctype* b_cast = b; \
|
||||
ctype* one = PASTEMAC(ch,1); \
|
||||
ctype* alpha11; \
|
||||
ctype* a12t; \
|
||||
ctype* b1; \
|
||||
ctype* b2; \
|
||||
ctype alpha_alpha11_conj; \
|
||||
dim_t iter, i; \
|
||||
dim_t n_ahead; \
|
||||
conj_t conja; \
|
||||
\
|
||||
if ( bl2_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
conja = bl2_extract_conj( transa ); \
|
||||
\
|
||||
for ( iter = 0; iter < m; ++iter ) \
|
||||
{ \
|
||||
i = iter; \
|
||||
n_ahead = m - i - 1; \
|
||||
alpha11 = a_cast + (i )*rs_a + (i )*cs_a; \
|
||||
a12t = a_cast + (i )*rs_a + (i+1)*cs_a; \
|
||||
b1 = b_cast + (i )*rs_b + (0 )*cs_b; \
|
||||
b2 = b_cast + (i+1)*rs_b + (0 )*cs_b; \
|
||||
\
|
||||
/* b1 = alpha * alpha11 * b1; */ \
|
||||
PASTEMAC2(ch,ch,copys)( *alpha_cast, alpha_alpha11_conj ); \
|
||||
\
|
||||
if ( bl2_is_nonunit_diag( diag ) ) \
|
||||
PASTEMAC2(ch,ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
|
||||
\
|
||||
PASTEMAC2(ch,ch,scalv)( BLIS_NO_CONJUGATE, \
|
||||
n, \
|
||||
&alpha_alpha11_conj, \
|
||||
b1, cs_b ); \
|
||||
\
|
||||
/* b1 = b1 + alpha * a12t * B2; */ \
|
||||
/* = b1 + alpha * B2^T * a12t^T; */ \
|
||||
PASTEMAC(ch,gemv)( BLIS_TRANSPOSE, \
|
||||
conja, \
|
||||
n_ahead, \
|
||||
n, \
|
||||
alpha_cast, \
|
||||
b2, rs_b, cs_b, \
|
||||
a12t, cs_a, \
|
||||
one, \
|
||||
b1, cs_b ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm, trmm_lu_unb_var1 )
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_lu_unb_var1( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_lu_unb_var1 )
|
||||
|
||||
@@ -1,159 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T trmm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
trans_t transa,
|
||||
diag_t diag,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b
|
||||
);
|
||||
|
||||
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
bl2_strmm_lu_unb_var2,
|
||||
bl2_ctrmm_lu_unb_var2,
|
||||
bl2_dtrmm_lu_unb_var2,
|
||||
bl2_ztrmm_lu_unb_var2
|
||||
};
|
||||
|
||||
void bl2_trmm_lu_unb_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
num_t dt_a = bl2_obj_datatype( *a );
|
||||
|
||||
trans_t transa = bl2_obj_conjtrans_status( *a );
|
||||
diag_t diag = bl2_obj_diag( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *b );
|
||||
dim_t n = bl2_obj_width( *b );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_alpha = bl2_obj_scalar_buffer( dt_a, *alpha );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_a];
|
||||
|
||||
// Invoke the function.
|
||||
f( transa,
|
||||
diag,
|
||||
m,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
) \
|
||||
{ \
|
||||
ctype* alpha_cast = alpha; \
|
||||
ctype* a_cast = a; \
|
||||
ctype* b_cast = b; \
|
||||
ctype* a01; \
|
||||
ctype* alpha11; \
|
||||
ctype* b0; \
|
||||
ctype* b1; \
|
||||
ctype alpha_alpha11_conj; \
|
||||
dim_t iter, i; \
|
||||
dim_t n_behind; \
|
||||
conj_t conja; \
|
||||
\
|
||||
if ( bl2_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
conja = bl2_extract_conj( transa ); \
|
||||
\
|
||||
for ( iter = 0; iter < m; ++iter ) \
|
||||
{ \
|
||||
i = iter; \
|
||||
n_behind = i; \
|
||||
a01 = a_cast + (0 )*rs_a + (i )*cs_a; \
|
||||
alpha11 = a_cast + (i )*rs_a + (i )*cs_a; \
|
||||
b0 = b_cast + (0 )*rs_b + (0 )*cs_b; \
|
||||
b1 = b_cast + (i )*rs_b + (0 )*cs_b; \
|
||||
\
|
||||
/* B0 = B0 + alpha * a01 * b1; */ \
|
||||
PASTEMAC(ch,ger)( conja, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n_behind, \
|
||||
n, \
|
||||
alpha_cast, \
|
||||
a01, rs_a, \
|
||||
b1, cs_b, \
|
||||
b0, rs_b, cs_b ); \
|
||||
\
|
||||
/* b1 = alpha * alpha11 * b1; */ \
|
||||
PASTEMAC2(ch,ch,copys)( *alpha_cast, alpha_alpha11_conj ); \
|
||||
\
|
||||
if ( bl2_is_nonunit_diag( diag ) ) \
|
||||
PASTEMAC2(ch,ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
|
||||
\
|
||||
PASTEMAC2(ch,ch,scalv)( BLIS_NO_CONJUGATE, \
|
||||
n, \
|
||||
&alpha_alpha11_conj, \
|
||||
b1, cs_b ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm, trmm_lu_unb_var2 )
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_lu_unb_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_lu_unb_var2 )
|
||||
|
||||
@@ -1,134 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T trmm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
trans_t transa,
|
||||
diag_t diag,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* a, inc_t rs_a, inc_t cs_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b
|
||||
);
|
||||
|
||||
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
bl2_strmm_lu_unb_var3,
|
||||
bl2_ctrmm_lu_unb_var3,
|
||||
bl2_dtrmm_lu_unb_var3,
|
||||
bl2_ztrmm_lu_unb_var3
|
||||
};
|
||||
|
||||
void bl2_trmm_lu_unb_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
num_t dt_a = bl2_obj_datatype( *a );
|
||||
|
||||
trans_t transa = bl2_obj_conjtrans_status( *a );
|
||||
diag_t diag = bl2_obj_diag( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *b );
|
||||
dim_t n = bl2_obj_width( *b );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
|
||||
void* buf_alpha = bl2_obj_scalar_buffer( dt_a, *alpha );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_a];
|
||||
|
||||
// Invoke the function.
|
||||
f( transa,
|
||||
diag,
|
||||
m,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
) \
|
||||
{ \
|
||||
ctype* alpha_cast = alpha; \
|
||||
ctype* a_cast = a; \
|
||||
ctype* b_cast = b; \
|
||||
ctype* b1; \
|
||||
dim_t j; \
|
||||
\
|
||||
if ( bl2_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
b1 = b_cast + (0 )*rs_b + (j )*cs_b; \
|
||||
\
|
||||
/* b1 = alpha * triu( A ) * b1; */ \
|
||||
PASTEMAC2(ch,ch,trmv)( BLIS_UPPER, \
|
||||
transa, \
|
||||
diag, \
|
||||
m, \
|
||||
alpha_cast, \
|
||||
a_cast, rs_a, cs_a, \
|
||||
b1, rs_b ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm, trmm_lu_unb_var3 )
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bl2_trmm_lu_unb_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* cntl );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
trans_t transa, \
|
||||
diag_t diag, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_lu_unb_var3 )
|
||||
|
||||
@@ -1,466 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
doff_t diagoffa,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_u_ker_var2);
|
||||
|
||||
|
||||
void bl2_trmm_u_ker_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
|
||||
doff_t diagoffa = bl2_obj_diag_offset( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
inc_t ps_a = bl2_obj_panel_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
inc_t ps_b = bl2_obj_panel_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
/*
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate (by a factor of two):
|
||||
// - the m dimension,
|
||||
// - the column stride of c,
|
||||
// - the column stride (ie: the panel length) of a, and
|
||||
// - the panel stride of a.
|
||||
if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
ps_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// If beta is a scalar constant, use dt_exec to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the beta object and extract the buffer at the beta offset.
|
||||
// (If beta is complex with a zero imaginary component, that is reflected
|
||||
// in dt_beta. However, that functionality is not used here.)
|
||||
bl2_set_scalar_dt_buffer( beta, dt_exec, dt_beta, buf_beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a, ps_a,
|
||||
buf_b, rs_b, cs_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffa, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
|
||||
PASTEMAC2(ch,varname,_nr) * \
|
||||
PASTEMAC2(ch,varname,_ndup) ]; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
|
||||
PASTEMAC2(ch,varname,_nr) ]; \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
|
||||
\
|
||||
/* Alias some constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
|
||||
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
|
||||
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict bd_i; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t k_ndup; \
|
||||
dim_t k_a1011; \
|
||||
dim_t off_a1011; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == GEMM_MR
|
||||
ps_a == stride to next row panel of A
|
||||
rs_b == GEMM_NR
|
||||
cs_b == 1
|
||||
ps_b == stride to next column panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
if ( bl2_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
|
||||
So we do nothing. */ \
|
||||
if ( bl2_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* If the diagonal offset is positive, adjust the pointer to B and
|
||||
treat this case as if the diagonal offset were zero. Note that
|
||||
we don't need to adjust the pointer to A since packm would have
|
||||
simply skipped over the regions that were not stored. */ \
|
||||
if ( diagoffa > 0 ) \
|
||||
{ \
|
||||
j = diagoffa; \
|
||||
n = n - j; \
|
||||
diagoffa = 0; \
|
||||
b_cast = b_cast + (j )*rs_b; \
|
||||
} \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
off_a1011 = bl2_max( diagoffa, 0 ); \
|
||||
k_a1011 = k - off_a1011; \
|
||||
k_ndup = k_a1011 * NDUP; \
|
||||
\
|
||||
rstep_a = k * MR; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = MR * rs_c; \
|
||||
cstep_c = NR * cs_c; \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Copy the current iteration's NR columns of B to a local buffer
|
||||
with each value duplicated. */ \
|
||||
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
|
||||
\
|
||||
/* Interior loop. */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = bl2_max( diagoffa_i, 0 ); \
|
||||
k_a1011 = k - off_a1011; \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
|
||||
one, \
|
||||
a1, \
|
||||
bd_i, \
|
||||
beta, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += k_a1011 * MR; \
|
||||
} \
|
||||
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Bottom edge handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, m_left, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = bl2_max( diagoffa_i, 0 ); \
|
||||
k_a1011 = k - off_a1011; \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
|
||||
one, \
|
||||
a1, \
|
||||
bd_i, \
|
||||
beta, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, m_left, k ) ) \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
if ( n_left ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Copy the n_left (+ padding) columns of B to a local buffer
|
||||
with each value duplicated. */ \
|
||||
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
|
||||
\
|
||||
/* Right edge loop. */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = bl2_max( diagoffa_i, 0 ); \
|
||||
k_a1011 = k - off_a1011; \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
|
||||
one, \
|
||||
a1, \
|
||||
bd_i, \
|
||||
beta, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += k_a1011 * MR; \
|
||||
} \
|
||||
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Bottom-right corner handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, m_left, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = bl2_max( diagoffa_i, 0 ); \
|
||||
k_a1011 = k - off_a1011; \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
|
||||
one, \
|
||||
a1, \
|
||||
bd_i, \
|
||||
beta, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, m_left, k ) ) \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_ukr)( k, \
|
||||
one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm, trmm_u_ker_var2 )
|
||||
|
||||
@@ -1,363 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
doff_t diagoffa,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_l_ker_var2);
|
||||
|
||||
|
||||
void bl2_trsm_l_ker_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
|
||||
doff_t diagoffa = bl2_obj_diag_offset( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
inc_t ps_a = bl2_obj_panel_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
inc_t ps_b = bl2_obj_panel_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
/*
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate (by a factor of two):
|
||||
// - the m dimension,
|
||||
// - the column stride of c,
|
||||
// - the column stride (ie: the panel length) of a, and
|
||||
// - the panel stride of a.
|
||||
if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
ps_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a, ps_a,
|
||||
buf_b, rs_b, cs_b, ps_b,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
//
// GENTFUNC template for the typed trsm_l_ker_var2 macro-kernel.
//
// For each datatype the template defines PASTEMAC(ch,varname), which walks
// C in NR-column steps (outer loop) and MR-row steps (inner loop). For every
// row panel of A it does one of the following:
//   - panel intersects the diagonal: call the fused gemm/trsm micro-kernel
//     (_trsm_ukr) on A10/A11 and the duplicated copy of B held in bd;
//   - panel is strictly below the diagonal: call the gemm micro-kernel
//     (_gemm_ukr) with the minus_one scalar;
//   - otherwise: do nothing.
// Partial (edge) tiles are computed into the local MR x NR buffer ct and
// then copied or added into C. The rs_a, cs_a, ps_a, rs_b and cs_b arguments
// are not referenced in the body; see the assumptions comment inside.
//
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffa, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
|
||||
PASTEMAC2(ch,varname,_nr) * \
|
||||
PASTEMAC2(ch,varname,_ndup) ]; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
|
||||
PASTEMAC2(ch,varname,_nr) ]; \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
|
||||
\
|
||||
/* Alias constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
|
||||
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
|
||||
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict a10; \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict bd01; \
|
||||
ctype* restrict bd11; \
|
||||
ctype* restrict bd_i; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_ndup; \
|
||||
dim_t k_a1011; \
|
||||
dim_t k_a10; \
|
||||
dim_t off_a1011, off_b11; \
|
||||
dim_t i, j; \
|
||||
dim_t rstep_a; \
|
||||
dim_t rstep_b, cstep_b; \
|
||||
dim_t rstep_c, cstep_c; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == GEMM_MR
|
||||
ps_a == stride to next row panel of A
|
||||
rs_b == GEMM_NR
|
||||
cs_b == 1
|
||||
ps_b == stride to next column panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
if ( bl2_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
|
||||
So we do nothing. */ \
|
||||
if ( bl2_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* The first thing we do is check the k dimension, which needs to be
|
||||
a multiple of MR. If k isn't a multiple of MR, we adjust it higher.
|
||||
This allows us to use a single micro-kernel, which performs an
|
||||
MR x MR triangular solve, even for cases when k isn't actually a
|
||||
multiple of MR. The key is that when A was packed, its edges were
|
||||
first zero padded, and further, the panel that stores the bottom-
|
||||
right corner of the matrix has its diagonal that extends into
|
||||
the zero padded region as identity. This allows the trsm of that
|
||||
bottom-right panel to proceed without producing any infs or NaNs
|
||||
or any other numerical funny business that would infect the "good"
|
||||
values of the corresponding block of B. */ \
|
||||
if ( k % MR != 0 ) k += MR - ( k % MR ); \
|
||||
\
|
||||
/* If the diagonal offset is negative, adjust the pointer to C and
|
||||
treat this case as if the diagonal offset were zero. Note that
|
||||
we don't need to adjust the pointer to A since packm would have
|
||||
simply skipped over the panels that were not stored. */ \
|
||||
if ( diagoffa < 0 ) \
|
||||
{ \
|
||||
i = -diagoffa; \
|
||||
m = m - i; \
|
||||
diagoffa = 0; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
} \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
k_a1011 = bl2_min( k, diagoffa + m ); \
|
||||
k_ndup = k_a1011 * NDUP; \
|
||||
\
|
||||
off_b11 = diagoffa; \
|
||||
\
|
||||
rstep_a = k * MR; \
|
||||
\
|
||||
rstep_b = NR * MR; \
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
b11 = b1 + (off_b11 )*NR; \
|
||||
\
|
||||
n_cur = ( bl2_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Copy the current iteration's NR columns of B to a local buffer
|
||||
with each value duplicated. */ \
|
||||
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bl2_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, use a
|
||||
special micro-kernel that performs a fused gemm and trsm.
|
||||
If the current panel of A resides below the diagonal, use a
|
||||
regular gemm micro-kernel. Otherwise, if it is above the
|
||||
diagonal, it was not packed (because it is implicitly zero)
|
||||
and so we do nothing. */ \
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bl2_min( k, diagoffa_i + m_cur ); \
|
||||
k_a10 = k_a1011 - m_cur; \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Compute the addresses of the A10 panel and triangular
|
||||
block A11, and the corresponding panel Bd01 and block
|
||||
Bd11. */ \
|
||||
a10 = a1; \
|
||||
a11 = a1 + k_a10 * MR; \
|
||||
bd01 = bd_i; \
|
||||
bd11 = bd_i + k_a10 * NR * NDUP; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_trsm_ukr)( k_a10, \
|
||||
a10, \
|
||||
a11, \
|
||||
bd01, \
|
||||
bd11, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel, writing to ct. */ \
|
||||
PASTEMAC2(ch,varname,_trsm_ukr)( k_a10, \
|
||||
a10, \
|
||||
a11, \
|
||||
bd01, \
|
||||
bd11, \
|
||||
b11, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += k_a1011 * MR; \
|
||||
} \
|
||||
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_gemm_ukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel, writing to ct. */ \
|
||||
PASTEMAC2(ch,varname,_gemm_ukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bd, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
b11 += rstep_b; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trsm, trsm_l_ker_var2 )
|
||||
|
||||
@@ -1,367 +0,0 @@
|
||||
/*
|
||||
libblis
|
||||
An object-based infrastructure for developing high-performance
|
||||
dense linear algebra libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
libblis is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of
|
||||
the License, or (at your option) any later version.
|
||||
|
||||
libblis is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with libblis; if you did not receive a copy, see
|
||||
http://www.gnu.org/licenses/.
|
||||
|
||||
For more information, please contact us at blis@cs.utexas.edu or
|
||||
send mail to:
|
||||
|
||||
Field G. Van Zee and/or
|
||||
Robert A. van de Geijn
|
||||
The University of Texas at Austin
|
||||
Institute for Computational Engineering and Science
|
||||
1 University Station D9500
|
||||
Austin TX 78712
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
// FUNCPTR_T names the pointer type of the typed trsm_l_ker_var2
// implementations; its parameter list matches the functions generated by
// the GENTFUNC template later in this file.
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
doff_t diagoffa,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
// Table of typed implementations; bl2_trsm_l_ker_var2() indexes it by the
// execution datatype. (Presumably GENARRAY expands to one entry per
// datatype -- confirm against its definition.)
static FUNCPTR_T GENARRAY(ftypes,trsm_l_ker_var2);
|
||||
|
||||
|
||||
void bl2_trsm_l_ker_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
|
||||
doff_t diagoffa = bl2_obj_diag_offset( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
inc_t ps_a = bl2_obj_panel_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
inc_t ps_b = bl2_obj_panel_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
/*
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate (by a factor of two):
|
||||
// - the m dimension,
|
||||
// - the column stride of c,
|
||||
// - the column stride (ie: the panel length) of a, and
|
||||
// - the panel stride of a.
|
||||
if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
ps_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a, ps_a,
|
||||
buf_b, rs_b, cs_b, ps_b,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
//
// GENTFUNC template for the typed trsm_l_ker_var2 macro-kernel (variant in
// which the interior diagonal case is split into two micro-kernel calls).
//
// For each datatype the template defines PASTEMAC(ch,varname), which walks
// C in NR-column steps (outer loop) and MR-row steps (inner loop). For every
// row panel of A it does one of the following:
//   - panel intersects the diagonal, full MR x NR tile: call _gemm_ukr to
//     update b11 with a10 and bd01, then call _trsmonly_ukr;
//   - panel intersects the diagonal, edge tile: call the fused _trsm_ukr
//     into ct and copy the result into C;
//   - panel is strictly below the diagonal: call _gemm_ukr with minus_one;
//   - otherwise: do nothing.
//
// NOTE(review): the interior path uses separate _gemm_ukr/_trsmonly_ukr
// calls while the edge path still uses the fused _trsm_ukr -- confirm that
// this asymmetry is intended.
//
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffa, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
|
||||
PASTEMAC2(ch,varname,_nr) * \
|
||||
PASTEMAC2(ch,varname,_ndup) ]; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
|
||||
PASTEMAC2(ch,varname,_nr) ]; \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
|
||||
\
|
||||
/* Alias constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
|
||||
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
|
||||
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict a10; \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict bd01; \
|
||||
ctype* restrict bd11; \
|
||||
ctype* restrict bd_i; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_ndup; \
|
||||
dim_t k_a1011; \
|
||||
dim_t k_a10; \
|
||||
dim_t off_a1011, off_b11; \
|
||||
dim_t i, j; \
|
||||
dim_t rstep_a; \
|
||||
dim_t rstep_b, cstep_b; \
|
||||
dim_t rstep_c, cstep_c; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == GEMM_MR
|
||||
ps_a == stride to next row panel of A
|
||||
rs_b == GEMM_NR
|
||||
cs_b == 1
|
||||
ps_b == stride to next column panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
if ( bl2_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
|
||||
So we do nothing. */ \
|
||||
if ( bl2_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* The first thing we do is check the k dimension, which needs to be
|
||||
a multiple of MR. If k isn't a multiple of MR, we adjust it higher.
|
||||
This allows us to use a single micro-kernel, which performs an
|
||||
MR x MR triangular solve, even for cases when k isn't actually a
|
||||
multiple of MR. The key is that when A was packed, its edges were
|
||||
first zero padded, and further, the panel that stores the bottom-
|
||||
right corner of the matrix has its diagonal that extends into
|
||||
the zero padded region as identity. This allows the trsm of that
|
||||
bottom-right panel to proceed without producing any infs or NaNs
|
||||
or any other numerical funny business that would infect the "good"
|
||||
values of the corresponding block of B. */ \
|
||||
if ( k % MR != 0 ) k += MR - ( k % MR ); \
|
||||
\
|
||||
/* If the diagonal offset is negative, adjust the pointer to C and
|
||||
treat this case as if the diagonal offset were zero. Note that
|
||||
we don't need to adjust the pointer to A since packm would have
|
||||
simply skipped over the panels that were not stored. */ \
|
||||
if ( diagoffa < 0 ) \
|
||||
{ \
|
||||
i = -diagoffa; \
|
||||
m = m - i; \
|
||||
diagoffa = 0; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
} \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
k_a1011 = bl2_min( k, diagoffa + m ); \
|
||||
k_ndup = k_a1011 * NDUP; \
|
||||
\
|
||||
off_b11 = diagoffa; \
|
||||
\
|
||||
rstep_a = k * MR; \
|
||||
\
|
||||
rstep_b = NR * MR; \
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
b11 = b1 + (off_b11 )*NR; \
|
||||
\
|
||||
n_cur = ( bl2_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Copy the current iteration's NR columns of B to a local buffer
|
||||
with each value duplicated. */ \
|
||||
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bl2_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, perform
|
||||
a gemm update followed by a triangular solve.
|
||||
If the current panel of A resides below the diagonal, use a
|
||||
regular gemm micro-kernel. Otherwise, if it is above the
|
||||
diagonal, it was not packed (because it is implicitly zero)
|
||||
and so we do nothing. */ \
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bd. Then compute the length of that panel. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bl2_min( k, diagoffa_i + m_cur ); \
|
||||
k_a10 = k_a1011 - m_cur; \
|
||||
\
|
||||
bd_i = bd + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Compute the addresses of the A10 panel and triangular
|
||||
block A11, and the corresponding panel Bd01 and block
|
||||
Bd11. */ \
|
||||
a10 = a1; \
|
||||
a11 = a1 + k_a10 * MR; \
|
||||
bd01 = bd_i; \
|
||||
bd11 = bd_i + k_a10 * NR * NDUP; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel on b11, then the trsm-only micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_gemm_ukr)( k_a10, \
|
||||
minus_one, \
|
||||
a10, \
|
||||
bd01, \
|
||||
one, \
|
||||
b11, rs_b, cs_b ); \
|
||||
PASTEMAC2(ch,varname,_trsmonly_ukr)( k_a10, \
|
||||
a11, \
|
||||
bd11, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel, writing to ct. */ \
|
||||
PASTEMAC2(ch,varname,_trsm_ukr)( k_a10, \
|
||||
a10, \
|
||||
a11, \
|
||||
bd01, \
|
||||
bd11, \
|
||||
b11, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += k_a1011 * MR; \
|
||||
} \
|
||||
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC2(ch,varname,_gemm_ukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bd, \
|
||||
one, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel, writing to ct. */ \
|
||||
PASTEMAC2(ch,varname,_gemm_ukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bd, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
b11 += rstep_b; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trsm, trsm_l_ker_var2 )
|
||||
|
||||
@@ -1,386 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
// FUNCPTR_T names the pointer type of the typed trsm_l_ker_var3
// implementations; its parameter list matches the functions generated by
// the GENTFUNC template later in this file.
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
doff_t diagoffa,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
// Table of typed implementations; bl2_trsm_l_ker_var3() indexes it by the
// execution datatype. (Presumably GENARRAY expands to one entry per
// datatype -- confirm against its definition.)
static FUNCPTR_T GENARRAY(ftypes,trsm_l_ker_var3);
|
||||
|
||||
|
||||
void bl2_trsm_l_ker_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
|
||||
doff_t diagoffa = bl2_obj_diag_offset( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
inc_t ps_a = bl2_obj_panel_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
inc_t ps_b = bl2_obj_panel_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
/*
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate (by a factor of two):
|
||||
// - the m dimension,
|
||||
// - the column stride of c,
|
||||
// - the column stride (ie: the panel length) of a, and
|
||||
// - the panel stride of a.
|
||||
if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
ps_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a, ps_a,
|
||||
buf_b, rs_b, cs_b, ps_b,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
//
// GENTFUNC template for the typed trsm_l_ker_var3 macro-kernel.
//
// Template arguments: ctype/ch select the datatype, varname names the
// generated function, and trsmukr/gemmukr name the micro-kernels to call.
//
// The generated function walks C in NR-column steps (outer loop) and MR-row
// steps (inner loop). B is read through bp, which points at the duplication
// buffer bd when DUPB is set and at the current column panel b1 otherwise.
// For every row panel of A it does one of the following:
//   - panel intersects the diagonal: call gemmukr to update b11 with a10 and
//     bp01, then call trsmukr with a11, b11 and bp11;
//   - panel is strictly below the diagonal: call gemmukr with minus_one;
//   - otherwise: do nothing.
// Partial (edge) tiles are computed into the local MR x NR buffer ct, which
// is zeroed once up front, and then copied or added into C.
//
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, trsmukr, gemmukr ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffa, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
|
||||
PASTEMAC2(ch,varname,_nr) * \
|
||||
PASTEMAC2(ch,varname,_ndup) ]; \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
|
||||
PASTEMAC2(ch,varname,_nr) ]; \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
|
||||
\
|
||||
/* Alias constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
|
||||
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
|
||||
const bool_t DUPB = PASTEMAC2(ch,varname,_dupb); \
|
||||
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict a10; \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict bp01; \
|
||||
ctype* restrict bp11; \
|
||||
ctype* restrict bp_i; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_nr; \
|
||||
dim_t k_a1011; \
|
||||
dim_t k_a10; \
|
||||
dim_t off_a1011, off_b11; \
|
||||
dim_t i, j; \
|
||||
dim_t rstep_a; \
|
||||
dim_t rstep_b, cstep_b; \
|
||||
dim_t rstep_c, cstep_c; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == GEMM_MR
|
||||
ps_a == stride to next row panel of A
|
||||
rs_b == GEMM_NR
|
||||
cs_b == 1
|
||||
ps_b == stride to next column panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bl2_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
|
||||
So we do nothing. */ \
|
||||
if ( bl2_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* The first thing we do is check the k dimension, which needs to be
|
||||
a multiple of MR. If k isn't a multiple of MR, we adjust it higher.
|
||||
This allows us to use a single micro-kernel, which performs an
|
||||
MR x MR triangular solve, even for cases when k isn't actually a
|
||||
multiple of MR. The key is that when A was packed, its edges were
|
||||
first zero padded, and further, the panel that stores the bottom-
|
||||
right corner of the matrix has its diagonal that extends into
|
||||
the zero padded region as identity. This allows the trsm of that
|
||||
bottom-right panel to proceed without producing any infs or NaNs
|
||||
or any other numerical funny business that would infect the "good"
|
||||
values of the corresponding block of B. */ \
|
||||
if ( k % MR != 0 ) k += MR - ( k % MR ); \
|
||||
\
|
||||
/* If the diagonal offset is negative, adjust the pointer to C and
|
||||
treat this case as if the diagonal offset were zero. Note that
|
||||
we don't need to adjust the pointer to A since packm would have
|
||||
simply skipped over the panels that were not stored. */ \
|
||||
if ( diagoffa < 0 ) \
|
||||
{ \
|
||||
i = -diagoffa; \
|
||||
m = m - i; \
|
||||
diagoffa = 0; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_a1011 = bl2_min( k, diagoffa + m ); \
|
||||
k_nr = k_a1011 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * MR; \
|
||||
\
|
||||
rstep_b = NR * MR; \
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
off_b11 = diagoffa; \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
b11 = b1 + (off_b11 )*NR; \
|
||||
\
|
||||
n_cur = ( bl2_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bd ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bl2_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, invoke the
|
||||
gemm micro-kernel followed by the trsm micro-kernel.
|
||||
If the current panel of A resides below the diagonal, use a
|
||||
regular gemm micro-kernel. Otherwise, if it is above the
|
||||
diagonal, it was not packed (because it is implicitly zero)
|
||||
and so we do nothing. */ \
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bp. Then compute the length of that panel. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bl2_min( k, diagoffa_i + MR ); \
|
||||
k_a10 = k_a1011 - MR; \
|
||||
\
|
||||
bp_i = bp + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Compute the addresses of the A10 panel and triangular
|
||||
block A11, and the corresponding panel bp01 and block
|
||||
bp11. */ \
|
||||
a10 = a1; \
|
||||
a11 = a1 + k_a10 * MR; \
|
||||
bp01 = bp_i; \
|
||||
bp11 = bp_i + k_a10 * NR * NDUP; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm and trsm micro-kernels. */ \
|
||||
PASTEMAC(ch,gemmukr)( k_a10, \
|
||||
minus_one, \
|
||||
a10, \
|
||||
bp01, \
|
||||
one, \
|
||||
b11, rs_b, cs_b ); \
|
||||
PASTEMAC(ch,trsmukr)( a11, \
|
||||
b11, \
|
||||
bp11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm and trsm micro-kernels, writing to ct. */ \
|
||||
PASTEMAC(ch,gemmukr)( k_a10, \
|
||||
minus_one, \
|
||||
a10, \
|
||||
bp01, \
|
||||
one, \
|
||||
b11, rs_b, cs_b ); \
|
||||
PASTEMAC(ch,trsmukr)( a11, \
|
||||
b11, \
|
||||
bp11, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += k_a1011 * MR; \
|
||||
} \
|
||||
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bp, \
|
||||
one, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel, writing to ct. */ \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
b11 += rstep_b; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( trsm_l_ker_var3, TRSM_L_UKERNEL, GEMM_UKERNEL )
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
// Macro-kernel parameters for trsm_l_ker_var3.
//
// Every name below is an alias for the matching BLIS_DEFAULT_* configuration
// value; there is one alias per datatype prefix (s, d, c, z).
//
// NOTE: The MR and NR values MUST match the values that packm uses when
// initializing its control tree node.
//

// B-duplication switch (the same setting is used for every datatype).
#define bl2_strsm_l_ker_var3_dupb  BLIS_DEFAULT_DUPLICATE_B
#define bl2_dtrsm_l_ker_var3_dupb  BLIS_DEFAULT_DUPLICATE_B
#define bl2_ctrsm_l_ker_var3_dupb  BLIS_DEFAULT_DUPLICATE_B
#define bl2_ztrsm_l_ker_var3_dupb  BLIS_DEFAULT_DUPLICATE_B

// Number of duplicates per element of B.
#define bl2_strsm_l_ker_var3_ndup  BLIS_DEFAULT_NUM_DUPL_S
#define bl2_dtrsm_l_ker_var3_ndup  BLIS_DEFAULT_NUM_DUPL_D
#define bl2_ctrsm_l_ker_var3_ndup  BLIS_DEFAULT_NUM_DUPL_C
#define bl2_ztrsm_l_ker_var3_ndup  BLIS_DEFAULT_NUM_DUPL_Z

// KC blocksize.
#define bl2_strsm_l_ker_var3_kc    BLIS_DEFAULT_KC_S
#define bl2_dtrsm_l_ker_var3_kc    BLIS_DEFAULT_KC_D
#define bl2_ctrsm_l_ker_var3_kc    BLIS_DEFAULT_KC_C
#define bl2_ztrsm_l_ker_var3_kc    BLIS_DEFAULT_KC_Z

// MR blocksize.
#define bl2_strsm_l_ker_var3_mr    BLIS_DEFAULT_MR_S
#define bl2_dtrsm_l_ker_var3_mr    BLIS_DEFAULT_MR_D
#define bl2_ctrsm_l_ker_var3_mr    BLIS_DEFAULT_MR_C
#define bl2_ztrsm_l_ker_var3_mr    BLIS_DEFAULT_MR_Z

// NR blocksize.
#define bl2_strsm_l_ker_var3_nr    BLIS_DEFAULT_NR_S
#define bl2_dtrsm_l_ker_var3_nr    BLIS_DEFAULT_NR_D
#define bl2_ctrsm_l_ker_var3_nr    BLIS_DEFAULT_NR_C
#define bl2_ztrsm_l_ker_var3_nr    BLIS_DEFAULT_NR_Z
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
void bl2_trsm_l_ker_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffa, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_ker_var3 )
|
||||
|
||||
@@ -1,388 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
doff_t diagoffa,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_u_ker_var3);
|
||||
|
||||
|
||||
void bl2_trsm_u_ker_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bl2_obj_execution_datatype( *c );
|
||||
|
||||
doff_t diagoffa = bl2_obj_diag_offset( *a );
|
||||
|
||||
dim_t m = bl2_obj_length( *c );
|
||||
dim_t n = bl2_obj_width( *c );
|
||||
dim_t k = bl2_obj_width( *a );
|
||||
|
||||
void* buf_a = bl2_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bl2_obj_row_stride( *a );
|
||||
inc_t cs_a = bl2_obj_col_stride( *a );
|
||||
inc_t ps_a = bl2_obj_panel_stride( *a );
|
||||
|
||||
void* buf_b = bl2_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bl2_obj_row_stride( *b );
|
||||
inc_t cs_b = bl2_obj_col_stride( *b );
|
||||
inc_t ps_b = bl2_obj_panel_stride( *b );
|
||||
|
||||
void* buf_c = bl2_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bl2_obj_row_stride( *c );
|
||||
inc_t cs_c = bl2_obj_col_stride( *c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
/*
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate (by a factor of two):
|
||||
// - the m dimension,
|
||||
// - the column stride of c,
|
||||
// - the column stride (ie: the panel length) of a, and
|
||||
// - the panel stride of a.
|
||||
if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
ps_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a, ps_a,
|
||||
buf_b, rs_b, cs_b, ps_b,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, trsmukr, gemmukr ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffa, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
|
||||
PASTEMAC2(ch,varname,_nr) * \
|
||||
PASTEMAC2(ch,varname,_ndup) ]; \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
|
||||
PASTEMAC2(ch,varname,_nr) ]; \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
|
||||
\
|
||||
/* Alias constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
|
||||
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
|
||||
const bool_t DUPB = PASTEMAC2(ch,varname,_dupb); \
|
||||
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict a12; \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict bp21; \
|
||||
ctype* restrict bp11; \
|
||||
ctype* restrict bp_i; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_nr; \
|
||||
dim_t k_a1112; \
|
||||
dim_t k_a11, k_a12; \
|
||||
dim_t off_a1112, off_b11; \
|
||||
dim_t i, j, ib; \
|
||||
dim_t rstep_a; \
|
||||
dim_t rstep_b, cstep_b; \
|
||||
dim_t rstep_c, cstep_c; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == GEMM_MR
|
||||
ps_a == stride to next row panel of A
|
||||
rs_b == GEMM_NR
|
||||
cs_b == 1
|
||||
ps_b == stride to next column panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bl2_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
|
||||
So we do nothing. */ \
|
||||
if ( bl2_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* The first thing we do is check the k dimension, which needs to be
|
||||
a multiple of MR. If k isn't a multiple of MR, we adjust it higher.
|
||||
This allows us to use a single micro-kernel, which performs an
|
||||
MR x MR triangular solve, even for cases when k isn't actually a
|
||||
multiple of MR. The key is that when A was packed, its edges were
|
||||
first zero padded, and further, the panel that stores the bottom-
|
||||
right corner of the matrix has its diagonal that extendeds into
|
||||
the zero padded region as identity. This allows the trsm of that
|
||||
bottom-right panel to proceed without producing any infs or NaNs
|
||||
or any other numerical funny business that would infect the "good"
|
||||
values of the corresponding block of B. */ \
|
||||
if ( k % MR != 0 ) k += MR - ( k % MR ); \
|
||||
\
|
||||
/* If the diagonal offset is positive, adjust the pointer to B and
|
||||
treat this case as if the diagonal offset were zero. Note that
|
||||
we don't need to adjust the pointer to A since packm would have
|
||||
simply skipped over the panels that were not stored. */ \
|
||||
if ( diagoffa > 0 ) \
|
||||
{ \
|
||||
j = diagoffa; \
|
||||
k = k - j; \
|
||||
diagoffa = 0; \
|
||||
b_cast = b_cast + (j )*rs_b; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_a1112 = k; \
|
||||
k_nr = k_a1112 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * MR; \
|
||||
\
|
||||
rstep_b = NR * MR; \
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
off_b11 = 0; \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1 + (m_iter-1)*rstep_c; \
|
||||
b11 = b1 + (m_iter-1)*rstep_b; \
|
||||
\
|
||||
n_cur = ( bl2_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bd ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( ib = 0; ib < m_iter; ++ib ) \
|
||||
{ \
|
||||
i = m_iter - 1 - ib; \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bl2_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, use a
|
||||
special micro-kernel that performs a fused gemm and trsm.
|
||||
If the current panel of A resides above the diagonal, use a
|
||||
a regular gemm micro-kernel. Otherwise, if it is below the
|
||||
diagonal, it was not packed (because it is implicitly zero)
|
||||
and so we do nothing. */ \
|
||||
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in bp. Then compute the length of that panel. */ \
|
||||
off_a1112 = bl2_max( diagoffa_i, 0 ); \
|
||||
k_a1112 = k - off_a1112;; \
|
||||
k_a12 = k_a1112 - MR; \
|
||||
k_a11 = MR; \
|
||||
\
|
||||
bp_i = bp + off_a1112 * NR * NDUP; \
|
||||
\
|
||||
/* Compute the addresses of the A12 panel and triangular
|
||||
block A11, and the corresponding panel Bd21 and block
|
||||
Bd11. */ \
|
||||
a11 = a1; \
|
||||
a12 = a1 + k_a11 * MR; \
|
||||
bp11 = bp_i; \
|
||||
bp21 = bp_i + k_a11 * NR * NDUP; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm and trsm micro-kernels. */ \
|
||||
PASTEMAC(ch,gemmukr)( k_a12, \
|
||||
minus_one, \
|
||||
a12, \
|
||||
bp21, \
|
||||
one, \
|
||||
b11, rs_b, cs_b ); \
|
||||
PASTEMAC(ch,trsmukr)( a11, \
|
||||
b11, \
|
||||
bp11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
PASTEMAC(ch,gemmukr)( k_a12, \
|
||||
minus_one, \
|
||||
a12, \
|
||||
bp21, \
|
||||
one, \
|
||||
b11, rs_b, cs_b ); \
|
||||
PASTEMAC(ch,trsmukr)( a11, \
|
||||
b11, \
|
||||
bp11, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += k_a1112 * MR; \
|
||||
} \
|
||||
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bp, \
|
||||
one, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
b11 -= rstep_b; \
|
||||
c11 -= rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( trsm_u_ker_var3, TRSM_U_UKERNEL, GEMM_UKERNEL )
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
// Macro-kernel parameters for trsm_u_ker_var3.
//
// Every name below is an alias for the matching BLIS_DEFAULT_* configuration
// value; there is one alias per datatype prefix (s, d, c, z).
//
// NOTE: The MR and NR values MUST match the values that packm uses when
// initializing its control tree node.
//

// B-duplication switch (the same setting is used for every datatype).
#define bl2_strsm_u_ker_var3_dupb  BLIS_DEFAULT_DUPLICATE_B
#define bl2_dtrsm_u_ker_var3_dupb  BLIS_DEFAULT_DUPLICATE_B
#define bl2_ctrsm_u_ker_var3_dupb  BLIS_DEFAULT_DUPLICATE_B
#define bl2_ztrsm_u_ker_var3_dupb  BLIS_DEFAULT_DUPLICATE_B

// Number of duplicates per element of B.
#define bl2_strsm_u_ker_var3_ndup  BLIS_DEFAULT_NUM_DUPL_S
#define bl2_dtrsm_u_ker_var3_ndup  BLIS_DEFAULT_NUM_DUPL_D
#define bl2_ctrsm_u_ker_var3_ndup  BLIS_DEFAULT_NUM_DUPL_C
#define bl2_ztrsm_u_ker_var3_ndup  BLIS_DEFAULT_NUM_DUPL_Z

// KC blocksize (sizes the B duplication buffer of the macro-kernel).
#define bl2_strsm_u_ker_var3_kc    BLIS_DEFAULT_KC_S
#define bl2_dtrsm_u_ker_var3_kc    BLIS_DEFAULT_KC_D
#define bl2_ctrsm_u_ker_var3_kc    BLIS_DEFAULT_KC_C
#define bl2_ztrsm_u_ker_var3_kc    BLIS_DEFAULT_KC_Z

// MR blocksize.
#define bl2_strsm_u_ker_var3_mr    BLIS_DEFAULT_MR_S
#define bl2_dtrsm_u_ker_var3_mr    BLIS_DEFAULT_MR_D
#define bl2_ctrsm_u_ker_var3_mr    BLIS_DEFAULT_MR_C
#define bl2_ztrsm_u_ker_var3_mr    BLIS_DEFAULT_MR_Z

// NR blocksize.
#define bl2_strsm_u_ker_var3_nr    BLIS_DEFAULT_NR_S
#define bl2_dtrsm_u_ker_var3_nr    BLIS_DEFAULT_NR_D
#define bl2_ctrsm_u_ker_var3_nr    BLIS_DEFAULT_NR_C
#define bl2_ztrsm_u_ker_var3_nr    BLIS_DEFAULT_NR_Z
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
void bl2_trsm_u_ker_var3( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffa, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_u_ker_var3 )
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
/*
   Single-precision real variant of the 4x2 trsm_l micro-kernel.

   Placeholder only: no argument is read or written. The function just hands
   BLIS_NOT_YET_IMPLEMENTED to bl2_check_error_code().
*/
void bl2_strsm_l_4x2( float* restrict a11,
                      float* restrict b11,
                      float* restrict bd11,
                      float* restrict c11, inc_t rs_c, inc_t cs_c )
{
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
|
||||
|
||||
/*
   Double-precision real 4x2 trsm_l micro-kernel (reference implementation).

   Performs forward substitution with the 4x4 lower-triangular block a11
   (unit row stride, column stride 4) on the 4x2 block b11 (row stride 2,
   unit column stride). For each row i, top to bottom:

     x(i,:) = ( b(i,:) - sum_{p<i} a(i,p) * x(p,:) ) * a(i,i)

   The row is MULTIPLIED by the stored diagonal entry, so a11 presumably
   holds the inverse of the diagonal -- TODO confirm against the packing
   routine.

   Each solved row is written to b11 and to c11 (using rs_c/cs_c) before
   the next row is processed. After all rows are solved, every solved
   element is written NDUP (2) times, consecutively, into bd11.
*/
void bl2_dtrsm_l_4x2( double* restrict a11,
                      double* restrict b11,
                      double* restrict bd11,
                      double* restrict c11, inc_t rs_c, inc_t cs_c )
{
    /* Micro-tile shape and number of duplicates kept in bd11. */
    enum { MR = 4, NR = 2, NDUP = 2 };

    /* Fixed storage layout of the packed operands. */
    const dim_t rs_a   = 1;
    const dim_t cs_a   = MR;

    const dim_t rs_b   = NR;
    const dim_t cs_b   = 1;

    const dim_t inc_bd = cs_b*NDUP;

    double x[ MR ][ NR ];
    int    i, j, p, d;

    /* Load the entire right-hand side before anything is stored. */
    for ( i = 0; i < MR; ++i )
    {
        for ( j = 0; j < NR; ++j )
        {
            x[ i ][ j ] = *(b11 + i*rs_b + j*cs_b);
        }
    }

    /* Forward substitution, one row of the triangle per pass. */
    for ( i = 0; i < MR; ++i )
    {
        double a_row[ MR ];

        /* Entries a(i,0) .. a(i,i) of the current row of a11. */
        for ( p = 0; p <= i; ++p )
        {
            a_row[ p ] = *(a11 + i*rs_a + p*cs_a);
        }

        for ( j = 0; j < NR; ++j )
        {
            /* Row 0 has nothing to subtract; a literal 0.0 is used. The
               sum is seeded with its first product (not with 0.0) so the
               additions occur in the same left-to-right order as a
               written-out sum. */
            double acc = 0.0;

            if ( i > 0 )
            {
                acc = a_row[ 0 ] * x[ 0 ][ j ];

                for ( p = 1; p < i; ++p )
                {
                    acc += a_row[ p ] * x[ p ][ j ];
                }
            }

            x[ i ][ j ] -= acc;
            x[ i ][ j ] *= a_row[ i ];
        }

        /* Store the solved row: first to b11, then to c11. */
        for ( j = 0; j < NR; ++j )
        {
            *(b11 + i*rs_b + j*cs_b) = x[ i ][ j ];
        }
        for ( j = 0; j < NR; ++j )
        {
            *(c11 + i*rs_c + j*cs_c) = x[ i ][ j ];
        }
    }

    /* Update bd: each solved element, in row-major order, NDUP times. */
    for ( i = 0; i < MR; ++i )
    {
        for ( j = 0; j < NR; ++j )
        {
            for ( d = 0; d < NDUP; ++d )
            {
                *(bd11 + ( i*NR + j )*inc_bd + d*cs_b) = x[ i ][ j ];
            }
        }
    }
}
|
||||
|
||||
void bl2_ctrsm_l_4x2(
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bl2_ztrsm_l_4x2(
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* a11, \
|
||||
ctype* b11, \
|
||||
ctype* bd11, \
|
||||
ctype* c11, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_4x2 )
|
||||
|
||||
@@ -1,222 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
/*
   Single-precision real variant of the 4x4 trsm_l micro-kernel.

   Placeholder only: no argument is read or written. The function just hands
   BLIS_NOT_YET_IMPLEMENTED to bl2_check_error_code().
*/
void bl2_strsm_l_4x4( float* restrict a11,
                      float* restrict b11,
                      float* restrict bd11,
                      float* restrict c11, inc_t rs_c, inc_t cs_c )
{
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
|
||||
|
||||
/*
 * Reference double-precision kernel operating on one 4x4 tile.
 *
 * a11 is read as a 4x4 lower-triangular block with row stride 1 and
 * column stride 4; b11 is read and overwritten as a 4x4 block with row
 * stride 4 and column stride 1.  Rows are resolved top to bottom: row i
 * of b11 has the products of a11(i,k) with the already-final rows k < i
 * subtracted and is then multiplied (not divided) by a11(i,i).  Each
 * finished row is written back to b11 and also to c11 using rs_c/cs_c.
 *
 * NOTE(review): multiplying by the diagonal only amounts to a triangular
 * solve if a11 carries pre-inverted diagonal entries -- presumably the
 * caller arranges that; confirm.  bd11 is accepted but never touched.
 */
void bl2_dtrsm_l_4x4( double* restrict a11,
                      double* restrict b11,
                      double* restrict bd11,
                      double* restrict c11, inc_t rs_c, inc_t cs_c )
{
    const dim_t rs_a = 1;
    const dim_t cs_a = 4;

    const dim_t rs_b = 4;
    const dim_t cs_b = 1;

    double x[4][4];          /* working copy of b11, x[row][col] */
    double diag;             /* a11(i,i) for the current row     */
    double m0, m1, m2;       /* a11(i,0..2) for the current row  */
    int    i, j;

    /* Pull the whole of b11 into locals before anything is overwritten. */
    for ( i = 0; i < 4; ++i )
    {
        for ( j = 0; j < 4; ++j )
        {
            x[i][j] = *(b11 + i*rs_b + j*cs_b);
        }
    }

    /* Row 0: no earlier rows contribute. */
    diag = *(a11 + 0*rs_a + 0*cs_a);

    for ( j = 0; j < 4; ++j )
    {
        x[0][j] -= 0.0;
        x[0][j] *= diag;
    }
    for ( j = 0; j < 4; ++j )
    {
        *(b11 + 0*rs_b + j*cs_b) = x[0][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        *(c11 + 0*rs_c + j*cs_c) = x[0][j];
    }

    /* Row 1: depends on row 0. */
    m0   = *(a11 + 1*rs_a + 0*cs_a);
    diag = *(a11 + 1*rs_a + 1*cs_a);

    for ( j = 0; j < 4; ++j )
    {
        x[1][j] -= m0 * x[0][j];
        x[1][j] *= diag;
    }
    for ( j = 0; j < 4; ++j )
    {
        *(b11 + 1*rs_b + j*cs_b) = x[1][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        *(c11 + 1*rs_c + j*cs_c) = x[1][j];
    }

    /* Row 2: depends on rows 0 and 1. */
    m0   = *(a11 + 2*rs_a + 0*cs_a);
    m1   = *(a11 + 2*rs_a + 1*cs_a);
    diag = *(a11 + 2*rs_a + 2*cs_a);

    for ( j = 0; j < 4; ++j )
    {
        x[2][j] -= m0 * x[0][j] +
                   m1 * x[1][j];
        x[2][j] *= diag;
    }
    for ( j = 0; j < 4; ++j )
    {
        *(b11 + 2*rs_b + j*cs_b) = x[2][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        *(c11 + 2*rs_c + j*cs_c) = x[2][j];
    }

    /* Row 3: depends on rows 0, 1 and 2. */
    m0   = *(a11 + 3*rs_a + 0*cs_a);
    m1   = *(a11 + 3*rs_a + 1*cs_a);
    m2   = *(a11 + 3*rs_a + 2*cs_a);
    diag = *(a11 + 3*rs_a + 3*cs_a);

    for ( j = 0; j < 4; ++j )
    {
        x[3][j] -= m0 * x[0][j] +
                   m1 * x[1][j] +
                   m2 * x[2][j];
        x[3][j] *= diag;
    }
    for ( j = 0; j < 4; ++j )
    {
        *(b11 + 3*rs_b + j*cs_b) = x[3][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        *(c11 + 3*rs_c + j*cs_c) = x[3][j];
    }
}
|
||||
|
||||
void bl2_ctrsm_l_4x4(
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bl2_ztrsm_l_4x4(
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* a11, \
|
||||
ctype* b11, \
|
||||
ctype* bd11, \
|
||||
ctype* c11, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_4x4 )
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
/*
 * Placeholder for the single-precision (float) variant of this kernel.
 * None of the arguments are read; the only action is to pass
 * BLIS_NOT_YET_IMPLEMENTED to bl2_check_error_code().
 */
void bl2_strsm_u_4x2( float* restrict a11,
                      float* restrict b11,
                      float* restrict bd11,
                      float* restrict c11, inc_t rs_c, inc_t cs_c )
{
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
    return;
}
|
||||
|
||||
/*
 * Reference double-precision kernel operating on one 4x2 tile.
 *
 * a11 is read as a 4x4 upper-triangular block with row stride 1 and
 * column stride 4; b11 is read and overwritten as a 4x2 block with row
 * stride 2 and column stride 1.  Rows are resolved bottom to top: row i
 * of b11 has the products of a11(i,k) with the already-final rows k > i
 * subtracted and is then multiplied (not divided) by a11(i,i).  Each
 * finished row is written back to b11 and also to c11 using rs_c/cs_c.
 * Finally every element of the result is written twice, back to back,
 * into bd11 in row-major element order.
 *
 * NOTE(review): multiplying by the diagonal only amounts to a triangular
 * solve if a11 carries pre-inverted diagonal entries -- presumably the
 * caller arranges that; confirm.
 */
void bl2_dtrsm_u_4x2( double* restrict a11,
                      double* restrict b11,
                      double* restrict bd11,
                      double* restrict c11, inc_t rs_c, inc_t cs_c )
{
    const dim_t rs_a = 1;
    const dim_t cs_a = 4;

    const dim_t rs_b = 2;
    const dim_t cs_b = 1;

    const dim_t NDUP   = 2;
    const dim_t inc_bd = cs_b*NDUP;

    double x[4][2];          /* working copy of b11, x[row][col] */
    double diag;             /* a11(i,i) for the current row     */
    double u1, u2, u3;       /* a11(i,1..3) for the current row  */
    int    i, j, d;

    /* Pull the whole of b11 into locals before anything is overwritten. */
    for ( i = 0; i < 4; ++i )
    {
        for ( j = 0; j < 2; ++j )
        {
            x[i][j] = *(b11 + i*rs_b + j*cs_b);
        }
    }

    /* Row 3: no later rows contribute. */
    diag = *(a11 + 3*rs_a + 3*cs_a);

    for ( j = 0; j < 2; ++j )
    {
        x[3][j] -= 0.0;
        x[3][j] *= diag;
    }
    for ( j = 0; j < 2; ++j )
    {
        *(b11 + 3*rs_b + j*cs_b) = x[3][j];
    }
    for ( j = 0; j < 2; ++j )
    {
        *(c11 + 3*rs_c + j*cs_c) = x[3][j];
    }

    /* Row 2: depends on row 3. */
    diag = *(a11 + 2*rs_a + 2*cs_a);
    u3   = *(a11 + 2*rs_a + 3*cs_a);

    for ( j = 0; j < 2; ++j )
    {
        x[2][j] -= u3 * x[3][j];
        x[2][j] *= diag;
    }
    for ( j = 0; j < 2; ++j )
    {
        *(b11 + 2*rs_b + j*cs_b) = x[2][j];
    }
    for ( j = 0; j < 2; ++j )
    {
        *(c11 + 2*rs_c + j*cs_c) = x[2][j];
    }

    /* Row 1: depends on rows 2 and 3. */
    diag = *(a11 + 1*rs_a + 1*cs_a);
    u2   = *(a11 + 1*rs_a + 2*cs_a);
    u3   = *(a11 + 1*rs_a + 3*cs_a);

    for ( j = 0; j < 2; ++j )
    {
        x[1][j] -= u2 * x[2][j] +
                   u3 * x[3][j];
        x[1][j] *= diag;
    }
    for ( j = 0; j < 2; ++j )
    {
        *(b11 + 1*rs_b + j*cs_b) = x[1][j];
    }
    for ( j = 0; j < 2; ++j )
    {
        *(c11 + 1*rs_c + j*cs_c) = x[1][j];
    }

    /* Row 0: depends on rows 1, 2 and 3. */
    diag = *(a11 + 0*rs_a + 0*cs_a);
    u1   = *(a11 + 0*rs_a + 1*cs_a);
    u2   = *(a11 + 0*rs_a + 2*cs_a);
    u3   = *(a11 + 0*rs_a + 3*cs_a);

    for ( j = 0; j < 2; ++j )
    {
        x[0][j] -= u1 * x[1][j] +
                   u2 * x[2][j] +
                   u3 * x[3][j];
        x[0][j] *= diag;
    }
    for ( j = 0; j < 2; ++j )
    {
        *(b11 + 0*rs_b + j*cs_b) = x[0][j];
    }
    for ( j = 0; j < 2; ++j )
    {
        *(c11 + 0*rs_c + j*cs_c) = x[0][j];
    }

    /* Update bd: element (i,j) occupies slot i*2 + j, stored twice. */
    for ( i = 0; i < 4; ++i )
    {
        for ( j = 0; j < 2; ++j )
        {
            for ( d = 0; d < 2; ++d )
            {
                *(bd11 + (i*2 + j)*inc_bd + d*cs_b) = x[i][j];
            }
        }
    }
}
|
||||
|
||||
void bl2_ctrsm_u_4x2(
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bl2_ztrsm_u_4x2(
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* a11, \
|
||||
ctype* b11, \
|
||||
ctype* bd11, \
|
||||
ctype* c11, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_u_4x2 )
|
||||
|
||||
@@ -1,223 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
/*
 * Placeholder for the single-precision (float) variant of this kernel.
 * None of the arguments are read; the only action is to pass
 * BLIS_NOT_YET_IMPLEMENTED to bl2_check_error_code().
 */
void bl2_strsm_u_4x4( float* restrict a11,
                      float* restrict b11,
                      float* restrict bd11,
                      float* restrict c11, inc_t rs_c, inc_t cs_c )
{
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
    return;
}
|
||||
|
||||
/*
 * Reference double-precision kernel operating on one 4x4 tile.
 *
 * a11 is read as a 4x4 upper-triangular block with row stride 1 and
 * column stride 4; b11 is read and overwritten as a 4x4 block with row
 * stride 4 and column stride 1.  Rows are resolved bottom to top: row i
 * of b11 has the products of a11(i,k) with the already-final rows k > i
 * subtracted and is then multiplied (not divided) by a11(i,i).  Each
 * finished row is written back to b11 and also to c11 using rs_c/cs_c.
 *
 * NOTE(review): multiplying by the diagonal only amounts to a triangular
 * solve if a11 carries pre-inverted diagonal entries -- presumably the
 * caller arranges that; confirm.  bd11 is accepted but never touched.
 */
void bl2_dtrsm_u_4x4( double* restrict a11,
                      double* restrict b11,
                      double* restrict bd11,
                      double* restrict c11, inc_t rs_c, inc_t cs_c )
{
    const dim_t rs_a = 1;
    const dim_t cs_a = 4;

    const dim_t rs_b = 4;
    const dim_t cs_b = 1;

    double x[4][4];          /* working copy of b11, x[row][col] */
    double diag;             /* a11(i,i) for the current row     */
    double u1, u2, u3;       /* a11(i,1..3) for the current row  */
    int    i, j;

    /* Pull the whole of b11 into locals before anything is overwritten. */
    for ( i = 0; i < 4; ++i )
    {
        for ( j = 0; j < 4; ++j )
        {
            x[i][j] = *(b11 + i*rs_b + j*cs_b);
        }
    }

    /* Row 3: no later rows contribute. */
    diag = *(a11 + 3*rs_a + 3*cs_a);

    for ( j = 0; j < 4; ++j )
    {
        x[3][j] -= 0.0;
        x[3][j] *= diag;
    }
    for ( j = 0; j < 4; ++j )
    {
        *(b11 + 3*rs_b + j*cs_b) = x[3][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        *(c11 + 3*rs_c + j*cs_c) = x[3][j];
    }

    /* Row 2: depends on row 3. */
    diag = *(a11 + 2*rs_a + 2*cs_a);
    u3   = *(a11 + 2*rs_a + 3*cs_a);

    for ( j = 0; j < 4; ++j )
    {
        x[2][j] -= u3 * x[3][j];
        x[2][j] *= diag;
    }
    for ( j = 0; j < 4; ++j )
    {
        *(b11 + 2*rs_b + j*cs_b) = x[2][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        *(c11 + 2*rs_c + j*cs_c) = x[2][j];
    }

    /* Row 1: depends on rows 2 and 3. */
    diag = *(a11 + 1*rs_a + 1*cs_a);
    u2   = *(a11 + 1*rs_a + 2*cs_a);
    u3   = *(a11 + 1*rs_a + 3*cs_a);

    for ( j = 0; j < 4; ++j )
    {
        x[1][j] -= u2 * x[2][j] +
                   u3 * x[3][j];
        x[1][j] *= diag;
    }
    for ( j = 0; j < 4; ++j )
    {
        *(b11 + 1*rs_b + j*cs_b) = x[1][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        *(c11 + 1*rs_c + j*cs_c) = x[1][j];
    }

    /* Row 0: depends on rows 1, 2 and 3. */
    diag = *(a11 + 0*rs_a + 0*cs_a);
    u1   = *(a11 + 0*rs_a + 1*cs_a);
    u2   = *(a11 + 0*rs_a + 2*cs_a);
    u3   = *(a11 + 0*rs_a + 3*cs_a);

    for ( j = 0; j < 4; ++j )
    {
        x[0][j] -= u1 * x[1][j] +
                   u2 * x[2][j] +
                   u3 * x[3][j];
        x[0][j] *= diag;
    }
    for ( j = 0; j < 4; ++j )
    {
        *(b11 + 0*rs_b + j*cs_b) = x[0][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        *(c11 + 0*rs_c + j*cs_c) = x[0][j];
    }
}
|
||||
|
||||
void bl2_ctrsm_u_4x4(
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bl2_ztrsm_u_4x4(
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* a11, \
|
||||
ctype* b11, \
|
||||
ctype* bd11, \
|
||||
ctype* c11, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_u_4x4 )
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
@@ -1,127 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis2.h"
|
||||
|
||||
/* Per-datatype duplication counts, taken from the BLIS_DEFAULT_NUM_DUPL_*
   configuration macros (s = float, d = double, c = scomplex, z = dcomplex). */
#define NDUP_S        BLIS_DEFAULT_NUM_DUPL_S
#define NDUP_D        BLIS_DEFAULT_NUM_DUPL_D
#define NDUP_C        BLIS_DEFAULT_NUM_DUPL_C
#define NDUP_Z        BLIS_DEFAULT_NUM_DUPL_Z

/* Per-datatype count of source elements handled per main-loop iteration.
   Only the double-precision routine in this file has an unrolled loop. */
#define UNROLL_FAC_S  1
#define UNROLL_FAC_D  8
#define UNROLL_FAC_C  1
#define UNROLL_FAC_Z  1
|
||||
|
||||
void bl2_sdupl(
|
||||
dim_t n_elem,
|
||||
float* b,
|
||||
float* bd
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bl2_ddupl(
|
||||
dim_t n_elem,
|
||||
double* b,
|
||||
double* bd
|
||||
)
|
||||
{
|
||||
dim_t n_iter = n_elem / UNROLL_FAC_D;
|
||||
dim_t n_left = n_elem % UNROLL_FAC_D;
|
||||
|
||||
const inc_t rstep_b = UNROLL_FAC_D;
|
||||
const inc_t step_bd = UNROLL_FAC_D * NDUP_D;
|
||||
|
||||
dim_t i;
|
||||
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
*(bd + 0) = *(b + 0);
|
||||
*(bd + 1) = *(b + 0);
|
||||
|
||||
*(bd + 2) = *(b + 1);
|
||||
*(bd + 3) = *(b + 1);
|
||||
|
||||
*(bd + 4) = *(b + 2);
|
||||
*(bd + 5) = *(b + 2);
|
||||
|
||||
*(bd + 6) = *(b + 3);
|
||||
*(bd + 7) = *(b + 3);
|
||||
|
||||
*(bd + 8) = *(b + 4);
|
||||
*(bd + 9) = *(b + 4);
|
||||
|
||||
*(bd + 10) = *(b + 5);
|
||||
*(bd + 11) = *(b + 5);
|
||||
|
||||
*(bd + 12) = *(b + 6);
|
||||
*(bd + 13) = *(b + 6);
|
||||
|
||||
*(bd + 14) = *(b + 7);
|
||||
*(bd + 15) = *(b + 7);
|
||||
|
||||
b += rstep_b;
|
||||
bd += step_bd;
|
||||
}
|
||||
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
*(bd + 0) = *(b + 0);
|
||||
*(bd + 1) = *(b + 0);
|
||||
|
||||
b += 1;
|
||||
bd += NDUP;
|
||||
}
|
||||
}
|
||||
|
||||
void bl2_cdupl(
|
||||
dim_t n_elem,
|
||||
scomplex* b,
|
||||
scomplex* bd
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bl2_zdupl(
|
||||
dim_t n_elem,
|
||||
dcomplex* b,
|
||||
dcomplex* bd
|
||||
)
|
||||
{
|
||||
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t n_elem, \
|
||||
ctype* b, \
|
||||
ctype* bd \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( dupl )
|
||||
|
||||
Reference in New Issue
Block a user