Removed several 'old' directories and files.

Details:
- Removed most of the 'old' directories scattered throughout the framework,
  which contained alternate, half-baked, or broken implementations.
This commit is contained in:
Field G. Van Zee
2013-03-24 18:49:36 -05:00
parent 551ea4767a
commit 132bffcef7
74 changed files with 0 additions and 10261 deletions

View File

@@ -1,33 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

View File

@@ -1,33 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

View File

@@ -1,273 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
// Function-pointer type for the typed copyv kernels held in the ftypes
// dispatch table. Arguments: conjugation flag for x, vector length, then the
// buffer/stride pair for x followed by the buffer/stride pair for y.
#define FUNCPTR_T copyv_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
dim_t n,
void* x, inc_t incx,
void* y, inc_t incy
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
// Exactly one of the three GENARRAY2_* forms below declares the table:
// _ALL when mixed-precision support is enabled, otherwise _EXT when
// mixed-domain support is enabled, otherwise _MIN.
// NOTE(review): the table is indexed as ftypes[dt_x][dt_y] by
// bl2_copyv_unb_var1, which calls the entry without a NULL check.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_unb_var1);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_unb_var1);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_unb_var1);
#endif
#endif
//
// Object-based front end for copyv (unblocked variant 1).
//
// Reads the datatypes of x and y, the conjugation status and vector
// dimension of x, and the stride and buffer of each operand, then forwards
// them to the typed kernel stored in ftypes for that datatype pair.
//
void bl2_copyv_unb_var1( obj_t* x,
                         obj_t* y )
{
	// The (source, destination) datatype pair selects the kernel.
	num_t  type_src   = bl2_obj_datatype( *x );
	num_t  type_dst   = bl2_obj_datatype( *y );

	// Conjugation and length are both taken from the source object.
	conj_t conj_src   = bl2_obj_conj_status( *x );
	dim_t  length     = bl2_obj_vector_dim( *x );

	// Stride and buffer address of the source ...
	inc_t  stride_src = bl2_obj_vector_inc( *x );
	void*  data_src   = bl2_obj_buffer_at_off( *x );

	// ... and of the destination.
	inc_t  stride_dst = bl2_obj_vector_inc( *y );
	void*  data_dst   = bl2_obj_buffer_at_off( *y );

	// Look the kernel up in the type-combination table and call it.
	ftypes[type_src][type_dst]( conj_src,
	                            length,
	                            data_src, stride_src,
	                            data_dst, stride_dst );
}
#undef GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
\
/* Typed copyv kernel generator. The generated function walks x and y by \
   their strides and applies the element macro copys to each x/y pair, or \
   copyjs when conjx is a conjugate flag. */ \
void PASTEMAC2(chx,chy,varname)( \
                                 conj_t conjx, \
                                 dim_t  n, \
                                 void*  x, inc_t incx, \
                                 void*  y, inc_t incy \
                               ) \
{ \
	ctype_x* xp        = x; \
	ctype_y* yp        = y; \
	dim_t    processed = 0; \
\
	/* Nothing to do when the vector dimension is zero. */ \
	if ( bl2_zero_dim1( n ) ) return; \
\
	if ( !bl2_is_conj( conjx ) ) \
	{ \
		/* Non-conjugating element macro. */ \
		while ( processed < n ) \
		{ \
			PASTEMAC2(chx,chy,copys)( *xp, *yp ); \
\
			xp += incx; \
			yp += incy; \
			++processed; \
		} \
	} \
	else \
	{ \
		/* Conjugating element macro. */ \
		while ( processed < n ) \
		{ \
			PASTEMAC2(chx,chy,copyjs)( *xp, *yp ); \
\
			xp += incx; \
			yp += incy; \
			++processed; \
		} \
	} \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
// The blanket INSERT_GENTFUNC2_BASIC instantiation and the double/double
// GENTFUNC2 line are commented out; the float, scomplex and dcomplex
// same-type kernels are instantiated individually instead.
// NOTE(review): presumably the (d,d) kernel is omitted here because
// bl2_ddcopyv_unb_var1 is written out by hand later in this file -- confirm
// that PASTEMAC2(d,d,copyv_unb_var1) expands to that name.
//INSERT_GENTFUNC2_BASIC( copyv, copyv_unb_var1 )
GENTFUNC2( float, float, s, s, copyv, copyv_unb_var1 )
//GENTFUNC2( double, double, d, d, copyv, copyv_unb_var1 )
GENTFUNC2( scomplex, scomplex, c, c, copyv, copyv_unb_var1 )
GENTFUNC2( dcomplex, dcomplex, z, z, copyv, copyv_unb_var1 )
// Mixed-domain kernels, only when that support is configured in.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC2_MIX_D( copyv, copyv_unb_var1 )
#endif
// Mixed-precision kernels, only when that support is configured in.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC2_MIX_P( copyv, copyv_unb_var1 )
#endif
//
// Hand-written double-to-double copyv kernel: y := x.
//
// Parameters:
//   conjx       conjugation flag; not consulted by this kernel.
//   n           number of elements to copy.
//   x, incx     source buffer (doubles) and its element stride.
//   y, incy     destination buffer (doubles) and its element stride.
//
// Code paths:
//   1. n == 2 or n == 4 with unit strides: fully unrolled assignments.
//   2. Unit strides and both buffers 16-byte aligned, on 32-bit x86 with a
//      GNU-compatible compiler: inline assembly that moves four doubles per
//      iteration (two aligned 16-byte moves) and then the n % 4 leftover
//      elements one at a time.
//   3. Everything else: a strided element-by-element loop.
//
// Changes relative to the previous version:
//   - The asm statement now lists "memory" and "cc" as clobbers. It reads
//     *x and writes *y behind the compiler's back and modifies the flags;
//     without the clobbers the compiler may cache or reorder accesses to
//     those buffers around the statement.
//   - Asm labels carry the %= suffix so each emitted copy of the statement
//     has unique label names (avoids duplicate-symbol assembler errors if
//     the function is inlined or cloned).
//   - The asm is compiled only for 32-bit x86; it loads the pointer and
//     stride operands with 32-bit moves, so other targets now take the
//     portable loop instead.
//
// NOTE(review): the asm loads n_iter, n_left, incx and incy with movl, so it
// assumes dim_t and inc_t are 32 bits wide on this target -- confirm.
// NOTE(review): ebx is clobbered, which older GCC releases reject when
// building position-independent code on i386 -- confirm the build flags.
//
void bl2_ddcopyv_unb_var1(
                           conj_t conjx,
                           dim_t  n,
                           void*  x, inc_t incx,
                           void*  y, inc_t incy
                         )
{
	double* restrict x_cast = x;
	double* restrict y_cast = y;
	dim_t            i;

	// Tiny unit-stride vectors: unrolled by hand.
	if ( n == 2 && incx == 1 && incy == 1 )
	{
		*(y_cast + 0) = *(x_cast + 0);
		*(y_cast + 1) = *(x_cast + 1);
		return;
	}
	else if ( n == 4 && incx == 1 && incy == 1 )
	{
		*(y_cast + 0) = *(x_cast + 0);
		*(y_cast + 1) = *(x_cast + 1);
		*(y_cast + 2) = *(x_cast + 2);
		*(y_cast + 3) = *(x_cast + 3);
		return;
	}

#if defined(__GNUC__) && defined(__i386__)
	// Aligned, contiguous vectors: SSE2 assembly path.
	if ( incx == 1 &&
	     incy == 1 &&
	     (unsigned long)x % 16 == 0 &&
	     (unsigned long)y % 16 == 0 )
	{
		dim_t n_iter = n / 4;
		dim_t n_left = n % 4;

		__asm__ volatile
		(
		"movl    %2, %%eax              \n\t" // eax = x
		"movl    %4, %%ebx              \n\t" // ebx = y
		"                               \n\t"
		"movl    %3, %%ecx              \n\t" // ecx = incx
		"movl    %5, %%edx              \n\t" // edx = incy
		"                               \n\t"
		"sall    $4, %%ecx              \n\t" // ecx = 16*incx
		"sall    $4, %%edx              \n\t" // edx = 16*incy
		"                               \n\t"
		"movl    %0, %%esi              \n\t" // esi = n_iter
		"testl   %%esi, %%esi           \n\t"
		"je      .CONSIDERKLEFT%=       \n\t"
		"                               \n\t"
		".LOOPKITER%=:                  \n\t" // four doubles per pass
		"                               \n\t"
		"movapd  (%%eax), %%xmm0        \n\t"
		"movapd  %%xmm0, (%%ebx)        \n\t"
		"                               \n\t"
		"movapd  (%%eax,%%ecx), %%xmm1  \n\t"
		"movapd  %%xmm1, (%%ebx,%%edx)  \n\t"
		"                               \n\t"
		"leal    (%%eax,%%ecx,2), %%eax \n\t" // x += 32 bytes
		"leal    (%%ebx,%%edx,2), %%ebx \n\t" // y += 32 bytes
		"                               \n\t"
		"decl    %%esi                  \n\t"
		"jne     .LOOPKITER%=           \n\t"
		"                               \n\t"
		".CONSIDERKLEFT%=:              \n\t"
		"                               \n\t"
		"movl    %1, %%esi              \n\t" // esi = n_left
		"testl   %%esi, %%esi           \n\t"
		"je      .DONE%=                \n\t"
		"                               \n\t"
		"sarl    $1, %%ecx              \n\t" // ecx = 8*incx
		"sarl    $1, %%edx              \n\t" // edx = 8*incy
		"                               \n\t"
		".LOOPKLEFT%=:                  \n\t" // one double per pass
		"                               \n\t"
		"movlpd  (%%eax), %%xmm0        \n\t"
		"movlpd  %%xmm0, (%%ebx)        \n\t"
		"                               \n\t"
		"addl    %%ecx, %%eax           \n\t"
		"addl    %%edx, %%ebx           \n\t"
		"                               \n\t"
		"decl    %%esi                  \n\t"
		"jne     .LOOPKLEFT%=           \n\t"
		"                               \n\t"
		".DONE%=:                       \n\t"
		"                               \n\t"
		: // output operands (none)
		: // input operands
		  "r" (n_iter),  // %0
		  "r" (n_left),  // %1
		  "m" (x),       // %2
		  "m" (incx),    // %3
		  "m" (y),       // %4
		  "m" (incy)     // %5
		: // register clobber list
		  "esi", "eax", "ebx", "ecx", "edx",
		  "xmm0", "xmm1", "xmm2", "xmm3",
		  "xmm4", "xmm5", "xmm6", "xmm7",
		  "memory", "cc"
		);

		return;
	}
#endif

	// General case: strided element-by-element copy.
	{
		double* chi1 = x_cast;
		double* psi1 = y_cast;

		for ( i = 0; i < n; ++i )
		{
			bl2_ddcopys( *chi1, *psi1 );

			chi1 += incx;
			psi1 += incy;
		}
	}
}

View File

@@ -1,33 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

View File

@@ -1,33 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

View File

@@ -1,78 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
ctype* p_edge = p_begin + (i )*rs_p; \
\
PASTEMAC2(ch,ch,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
} \
\
if ( *n_panel != n_panel_max ) \
{ \
dim_t j = *n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - j; \
ctype* p_edge = p_begin + (j )*cs_p; \
\
PASTEMAC2(ch,ch,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
} \
\
/*
if ( rs_p == 1 ) { \
printf( "packm_blk_var2: ps_p = %u\n", ps_p ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: p copied", m_panel_max, n_panel_max, \
p_begin, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
} \
\
if ( rs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: c copied", m_panel_max, n_panel_max, \
p_begin, 1, panel_dim, "%4.1f", "" ); \
} \
\
}
INSERT_GENTFUNC_BASIC( packm, packm_blk_var2 )

View File

@@ -1,134 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
\
if ( bl2_is_lower( uploc ) ) panel_off_i = 0; \
else panel_off_i = bl2_max( 0, diagoffc_i ); \
} \
} \
else \
{ \
panel_len_i = panel_len; \
panel_off_i = 0; \
} \
\
\
c_use = c_begin + panel_off_i*ldc; \
p_use = p_begin + panel_off_i*panel_dim; \
\
PASTEMAC(ch,packm_cxk)( conjc, \
panel_dim_i, \
panel_len_i, \
beta_cast, \
c_use, incc, ldc, \
p_use, panel_dim ); \
\
/*
if ( bl2_is_unit_diag( diagc ) ) \
{ \
PASTEMAC2(ch,ch,setd)( diagoffc_i, \
*m_panel, \
*n_panel, \
beta_cast, \
p_begin, rs_p, cs_p ); \
} \
\
if ( bl2_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
bl2_is_upper_or_lower( uploc ) && \
densify == TRUE ) \
{ \
PASTEMAC(ch,packm_densify)( strucc, \
diagoffc_i, \
uploc, \
transc, \
*m_panel, \
*n_panel, \
beta_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
*/ \
\
\
/*
PASTEMAC(ch,packm_cxk)( conjc, \
panel_dim_i, \
panel_len, \
beta_cast, \
c_begin, incc, ldc, \
p_begin, panel_dim ); \
*/ \
\
/* The packed memory region was acquired/allocated with "aligned"
dimensions (ie: dimensions that were possibly inflated up to a
multiple). When these dimensions are inflated, it creates empty
regions along the bottom and/or right edges of the matrix. If
either region exists, we set them to zero. This simplifies the
register level micro kernel in that it does not need to support
different register blockings for the edge cases. */ \
if ( *m_panel != m_panel_max ) \
{ \
dim_t m_edge = m_panel_max - *m_panel; \
dim_t n_edge = n_panel_max; \
ctype* p_edge = p_begin + (*m_panel )*rs_p; \
\
PASTEMAC2(ch,ch,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
} \
\
if ( *n_panel != n_panel_max ) \
{ \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - *n_panel; \
ctype* p_edge = p_begin + (*n_panel )*cs_p; \
\
PASTEMAC2(ch,ch,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
} \
} \
} \
\
}
INSERT_GENTFUNC_BASIC( packm, packm_blk_var3 )

View File

@@ -1,33 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

View File

@@ -1,108 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
BLIS_NO_TRANSPOSE, \
FALSE, \
m_a10, \
n_a10, \
m_max_a10, \
n_max_a10, \
beta, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p, ps_pt ); \
\
p_cast += m_max_a10 * n_max_a10; \
} \
\
/* Pack triangle subpartition A11. */ \
{ \
j = n_a10; \
c_begin = c_cast + (0 )*rs_c + (j )*cs_c; \
p_begin = p_cast; \
\
/* This instance of ps_pt is not used by var3. */ \
ps_pt = cs_p * n_max_a11; \
\
PASTEMAC(ch,packm_blk_var3)( BLIS_TRIANGULAR, \
0, \
diagc, \
uploc, \
BLIS_NO_TRANSPOSE, \
densify, \
invdiag, \
m_a11, \
n_a11, \
m_max_a11, \
n_max_a11, \
beta, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p, ps_p ); \
\
p_cast += m_max_a11 * n_max_a11; \
} \
\
/* If they exist, pack subpartitions A20 and A21. */ \
if ( m_a2021 ) \
{ \
i = m_a10; \
c_begin = c_cast + (i )*rs_c + (0 )*cs_c; \
p_begin = p_cast; \
\
ps_pt = cs_p * n_max_a2021; \
\
PASTEMAC(ch,packm_blk_var2)( BLIS_GENERAL, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
BLIS_NO_TRANSPOSE, \
FALSE, \
m_a2021, \
n_a2021, \
m_max_a2021, \
n_max_a2021, \
beta, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p, ps_pt ); \
} \
\
\
} \
else \
{ \
bl2_abort(); \
} \
\
}
INSERT_GENTFUNC_BASIC( packm, packm_blk_var4 )

View File

@@ -1,33 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

View File

@@ -1,91 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Pack subpartitions A10 and A11. */ \
{ \
c_begin = c_cast + (0 )*rs_c + (0 )*cs_c; \
p_begin = p_cast; \
\
PASTEMAC(ch,packm_blk_var3)( BLIS_TRIANGULAR, \
diagoffc, \
diagc, \
uploc, \
BLIS_NO_TRANSPOSE, \
densify, \
invdiag, \
revifup, \
reviflo, \
m_a1011, \
n_a1011, \
m_max_a1011, \
n_max_a1011, \
beta, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p, ps_p ); \
\
p_cast += step_a1011; \
} \
\
/* If they exist, pack subpartitions A20 and A21. */ \
if ( m_a2021 ) \
{ \
i = m_a1011; \
c_begin = c_cast + (i )*rs_c + (0 )*cs_c; \
p_begin = p_cast; \
\
ps_pt = cs_p * n_max_a2021; \
\
PASTEMAC(ch,packm_blk_var2)( BLIS_GENERAL, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
BLIS_NO_TRANSPOSE, \
FALSE, \
m_a2021, \
n_a2021, \
m_max_a2021, \
n_max_a2021, \
beta, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p, ps_pt ); \
} \
} \
else \
{ \
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
} \
\
}
INSERT_GENTFUNC_BASIC( packm, packm_blk_var4 )

View File

@@ -1,33 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

View File

@@ -1,238 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Function-pointer type for the typed gemm kernels held in the ftypes
// dispatch table. Arguments: the dimensions m, n and k, then a buffer with
// its row and column strides for each of a, b and c.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
);
// Dispatch table of typed gemm kernels; bl2_gemm_asm_var2 indexes it with
// the execution datatype of c.
// NOTE(review): the s, c, d, z ordering presumably mirrors the numeric
// values of the num_t datatype constants -- confirm against their definition.
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
{
bl2_sgemm_asm_var2,
bl2_cgemm_asm_var2,
bl2_dgemm_asm_var2,
bl2_zgemm_asm_var2
};
//
// Object-based front end for the gemm "asm" variant 2 kernels.
//
// Reads the dimensions from c (m = length, n = width) and a (k = width),
// the buffer and row/column strides of a, b and c, and dispatches to the
// typed kernel selected by the execution datatype of c.
//
// Parameters:
//   alpha, beta  not consulted here; neither is passed on to the kernel.
//   a, b, c      operand objects.
//   cntl         not consulted here.
//
// NOTE(review): because alpha and beta are ignored, the kernels can only
// accumulate into c as-is -- confirm that callers apply any scaling first.
//
// The locals dt_a / dt_b were only referenced by a commented-out special
// case (complex a and c with real b, which doubled m, cs_c and cs_a); both
// the dead code and the unused locals have been removed. That mixed case is
// therefore not handled by this function.
//
void bl2_gemm_asm_var2( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  b,
                        obj_t*  beta,
                        obj_t*  c,
                        gemm_t* cntl )
{
	num_t     dt_exec = bl2_obj_execution_datatype( *c );

	dim_t     m       = bl2_obj_length( *c );
	dim_t     n       = bl2_obj_width( *c );
	dim_t     k       = bl2_obj_width( *a );

	void*     buf_a   = bl2_obj_buffer_at_off( *a );
	inc_t     rs_a    = bl2_obj_row_stride( *a );
	inc_t     cs_a    = bl2_obj_col_stride( *a );

	void*     buf_b   = bl2_obj_buffer_at_off( *b );
	inc_t     rs_b    = bl2_obj_row_stride( *b );
	inc_t     cs_b    = bl2_obj_col_stride( *b );

	void*     buf_c   = bl2_obj_buffer_at_off( *c );
	inc_t     rs_c    = bl2_obj_row_stride( *c );
	inc_t     cs_c    = bl2_obj_col_stride( *c );

	FUNCPTR_T f;

	// Index into the type array to extract the correct function pointer.
	f = ftypes[dt_exec];

	// Invoke the function.
	f( m,
	   n,
	   k,
	   buf_a, rs_a, cs_a,
	   buf_b, rs_b, cs_b,
	   buf_c, rs_c, cs_c );
}
//
// Single-precision real slot of the gemm_asm_var2 kernel family.
// The body is empty: calling it leaves a, b and c untouched.
//
void PASTEMAC(s,gemm_asm_var2)(
                                dim_t m,
                                dim_t n,
                                dim_t k,
                                void* a, inc_t rs_a, inc_t cs_a,
                                void* b, inc_t rs_b, inc_t cs_b,
                                void* c, inc_t rs_c, inc_t cs_c
                              )
{
	// Intentionally empty placeholder.
}
//
// Single-precision complex slot of the gemm_asm_var2 kernel family.
// The body is empty: calling it leaves a, b and c untouched.
//
void PASTEMAC(c,gemm_asm_var2)(
                                dim_t m,
                                dim_t n,
                                dim_t k,
                                void* a, inc_t rs_a, inc_t cs_a,
                                void* b, inc_t rs_b, inc_t cs_b,
                                void* c, inc_t rs_c, inc_t cs_c
                              )
{
	// Intentionally empty placeholder.
}
//
// Double-precision complex slot of the gemm_asm_var2 kernel family.
// The body is empty: calling it leaves a, b and c untouched.
//
void PASTEMAC(z,gemm_asm_var2)(
                                dim_t m,
                                dim_t n,
                                dim_t k,
                                void* a, inc_t rs_a, inc_t cs_a,
                                void* b, inc_t rs_b, inc_t cs_b,
                                void* c, inc_t rs_c, inc_t cs_c
                              )
{
	// Intentionally empty placeholder.
}
#include "pmmintrin.h"
// Overlay of one __m128d SSE value (member v) with an array of its two
// double lanes (member d).
typedef union
{
__m128d v;
double d[2];
} v2df_t;
// Compile-time switch for the d kernel below: nonzero selects its scalar
// code path, zero selects the SSE code path.
#define NOSSE 0
//
// Double-precision kernel for gemm_asm_var2: accumulates the product of a
// and b into c, i.e. c(i,j) += a(i,h) * b(h,j) for all i < m, j < n, h < k.
//
//   m, n, k     - dimensions: c is m x n, a is m x k, b is k x n.
//   a, rs_a, cs_a - buffer of doubles and row/column strides of a.
//   b, rs_b, cs_b - buffer of doubles and row/column strides of b.
//   c, rs_c, cs_c - buffer of doubles and row/column strides of c (updated).
//
// For each column j and each h, one element of b is broadcast and an "axpy"
// is performed down the column of c. When a and c both have unit row stride
// (and NOSSE is 0) rows are processed two at a time with SSE; this path uses
// aligned loads/stores, so every addressed pair of doubles in a and c must
// be 16-byte aligned. Any remaining row, and every row when the strides are
// not unit or NOSSE is nonzero, is handled by a scalar loop that honours
// rs_a and rs_c.
//
// Changes relative to the previous revision:
//   - The scalar (NOSSE) path used to update four rows per iteration while
//     advancing by two rows, double-counting rows and running past the end
//     of the column. It now updates exactly one row per step.
//   - The SSE path used to be taken for any row stride even though it loads
//     two adjacent doubles; it is now restricted to unit row strides.
//   - Removed the unused a2v/c2v temporaries and stray semicolons.
//
void PASTEMAC(d,gemm_asm_var2)(
                                dim_t   m,
                                dim_t   n,
                                dim_t   k,
                                void*   a, inc_t rs_a, inc_t cs_a,
                                void*   b, inc_t rs_b, inc_t cs_b,
                                void*   c, inc_t rs_c, inc_t cs_c
                              )
{
    double* restrict a_cast = a;
    double* restrict b_cast = b;
    double* restrict c_cast = c;
    double* restrict a1;
    double* restrict b1;
    double* restrict c1;
    double* restrict alpha11;
    double* restrict beta11;
    double* restrict gamma11;

    dim_t            i, j, h;
    dim_t            m_vec;

#if !NOSSE
    v2df_t           b1v;
    v2df_t           a1v;
    v2df_t           c1v;

    // The vector path reads/writes two adjacent doubles per access, which is
    // only two consecutive rows when both row strides are one.
    const int        use_sse = ( rs_a == 1 && rs_c == 1 );
#endif

    for ( j = 0; j < n; ++j )
    {
        c1 = c_cast + (j  )*cs_c;
        b1 = b_cast + (j  )*cs_b;

        for ( h = 0; h < k; ++h )
        {
            a1     = a_cast + (h  )*cs_a;
            beta11 = b1     + (h  )*rs_b;

            alpha11 = a1;
            gamma11 = c1;

            // Number of leading rows consumed by the vector loop.
            m_vec = 0;

#if !NOSSE
            if ( use_sse )
            {
                // Broadcast b(h,j) into both lanes.
                b1v.v = _mm_loaddup_pd( beta11 );

                m_vec = 2 * ( m / 2 );

                for ( i = 0; i < m_vec; i += 2 )
                {
                    a1v.v = _mm_load_pd( alpha11 );
                    c1v.v = _mm_load_pd( gamma11 );

                    c1v.v += b1v.v * a1v.v;

                    _mm_store_pd( gamma11, c1v.v );

                    alpha11 += 2;
                    gamma11 += 2;
                }
            }
#endif

            // Scalar loop: the odd trailing row after the vector loop, or
            // all rows when the vector loop was not taken.
            for ( i = m_vec; i < m; ++i )
            {
                *gamma11 += *beta11 * *alpha11;

                alpha11 += rs_a;
                gamma11 += rs_c;
            }
        }
    }
}

View File

@@ -1,59 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based interface to the gemm_asm_var2 variant. It takes the scalar
// objects alpha and beta, the matrix objects a, b and c, and a gemm control
// tree node.
void bl2_gemm_asm_var2( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
gemm_t* cntl );
// GENPROT( chabc, varname ) expands to the prototype of the typed kernel
// PASTEMAC(chabc,varname), which takes the dimensions m, n, k followed by a
// buffer pointer, row stride and column stride for each of a, b and c.
// The invocations below declare the s, d, c and z instances of
// gemm_asm_var2; the macro supplies the terminating semicolon itself.
#undef GENPROT
#define GENPROT( chabc, varname ) \
\
void PASTEMAC(chabc,varname)( \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b, \
void* c, inc_t rs_c, inc_t cs_c \
);
GENPROT( s, gemm_asm_var2 )
GENPROT( d, gemm_asm_var2 )
GENPROT( c, gemm_asm_var2 )
GENPROT( z, gemm_asm_var2 )

View File

@@ -1,318 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// FUNCPTR_T is an alias for gemm_fp, the function-pointer type shared by the
// typed gemm_asm_var3 kernels: dimensions m, n, k followed by a buffer
// pointer, row stride and column stride for each of a, b and c.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
);
// Table of typed gemm_asm_var3 kernels. bl2_gemm_asm_var3() indexes it with
// the execution datatype of c.
// NOTE(review): the s, c, d, z ordering presumably mirrors the numeric values
// of the num_t enumeration -- confirm before reordering entries.
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
{
bl2_sgemm_asm_var3,
bl2_cgemm_asm_var3,
bl2_dgemm_asm_var3,
bl2_zgemm_asm_var3
};
//
// Object-based front end for the gemm_asm_var3 kernels.
//
// Pulls dimensions, buffers and strides out of the a, b and c objects and
// passes them to the typed kernel chosen by the execution datatype of c.
// alpha, beta and cntl are part of the interface but are not read here.
//
// The mixed-domain case (complex a and c with real b), which would require
// doubling m, cs_c and cs_a, is not handled by this function.
//
void bl2_gemm_asm_var3( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  b,
                        obj_t*  beta,
                        obj_t*  c,
                        gemm_t* cntl )
{
    // m and n come from the length and width of c; k from the width of a.
    const dim_t m = bl2_obj_length( *c );
    const dim_t n = bl2_obj_width( *c );
    const dim_t k = bl2_obj_width( *a );

    // Buffer (at the object's offset) and strides of each operand.
    void* const a_buf = bl2_obj_buffer_at_off( *a );
    const inc_t a_rs  = bl2_obj_row_stride( *a );
    const inc_t a_cs  = bl2_obj_col_stride( *a );

    void* const b_buf = bl2_obj_buffer_at_off( *b );
    const inc_t b_rs  = bl2_obj_row_stride( *b );
    const inc_t b_cs  = bl2_obj_col_stride( *b );

    void* const c_buf = bl2_obj_buffer_at_off( *c );
    const inc_t c_rs  = bl2_obj_row_stride( *c );
    const inc_t c_cs  = bl2_obj_col_stride( *c );

    // Look up the kernel for the execution datatype of c and call it.
    const FUNCPTR_T kernel = ftypes[ bl2_obj_execution_datatype( *c ) ];

    kernel( m, n, k,
            a_buf, a_rs, a_cs,
            b_buf, b_rs, b_cs,
            c_buf, c_rs, c_cs );
}
// Typed gemm_asm_var3 kernel for the 's' datatype character. The body is
// empty: the call returns immediately without reading a or b and without
// updating c.
// NOTE(review): looks like an unimplemented placeholder -- confirm.
void PASTEMAC(s,gemm_asm_var3)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
)
{
}
// Typed gemm_asm_var3 kernel for the 'c' datatype character. The body is
// empty: the call returns immediately without reading a or b and without
// updating c.
// NOTE(review): looks like an unimplemented placeholder -- confirm.
void PASTEMAC(c,gemm_asm_var3)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
)
{
}
// Typed gemm_asm_var3 kernel for the 'z' datatype character. The body is
// empty: the call returns immediately without reading a or b and without
// updating c.
// NOTE(review): looks like an unimplemented placeholder -- confirm.
void PASTEMAC(z,gemm_asm_var3)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
)
{
}
#include "pmmintrin.h"
// Overlay of one __m128d SSE value (member v) with an array of its two
// double lanes (member d).
typedef union
{
__m128d v;
double d[2];
} v2df_t;
// Defined for parity with the other asm variants; the d kernel below has no
// code conditional on it.
#define NOSSE 0
//
// Double-precision kernel for gemm_asm_var3: accumulates the product of a
// and b into c using a 4x2 register block of c.
//
//   m, n, k     - dimensions: c is m x n, a is m x k, b is k x n.
//   a, cs_a     - buffer and column stride of a.
//   b, cs_b     - buffer and column stride of b.
//   c, rs_c, cs_c - buffer and strides of c (updated in place).
//   rs_a, rs_b  - accepted but not read.
//
// Addressing assumptions made by the code: consecutive rows of a, b and c
// are adjacent in memory (a and c advance by 4 doubles per row block, b by
// one double per step of k), and every pair of doubles touched by
// _mm_load_pd/_mm_store_pd is 16-byte aligned.
// NOTE(review): the block corners use 2*rs_c while the row block advances by
// a literal 4, so the vector path is only self-consistent for rs_c == 1.
//
// Each 4x2 block of c is held in four SSE registers while the k loop,
// unrolled by two, streams a 4x1 column of a and broadcasts two elements
// of b per step.
//
// Change relative to the previous revision: the trailing m % 4 rows and the
// trailing n % 2 column were computed as m_left/n_left but never processed,
// leaving those parts of c untouched. They are now updated by scalar
// cleanup loops that use the same addressing assumptions as the vector code.
//
void PASTEMAC(d,gemm_asm_var3)(
                                dim_t   m,
                                dim_t   n,
                                dim_t   k,
                                void*   a, inc_t rs_a, inc_t cs_a,
                                void*   b, inc_t rs_b, inc_t cs_b,
                                void*   c, inc_t rs_c, inc_t cs_c
                              )
{
    double* restrict a_cast = a;
    double* restrict b_cast = b;
    double* restrict c_cast = c;
    double* restrict a1;
    double* restrict b1;
    double* restrict c1;
    double* restrict a11;
    double* restrict b11;
    double* restrict c11;
    double* restrict alpha00;
    double* restrict alpha20;
    double* restrict beta00;
    double* restrict beta01;
    double* restrict gamma00;
    double* restrict gamma20;
    double* restrict gamma01;
    double* restrict gamma21;

    v2df_t           c00v, c01v;
    v2df_t           c10v, c11v;
    v2df_t           a0v, a1v;
    v2df_t           b0v, b1v;

    double           rho0, rho1;

    dim_t            i, j, h;

    dim_t            n_iter = n / 2;
    dim_t            n_left = n % 2;

    dim_t            m_iter = m / 4;
    dim_t            m_left = m % 4;

    dim_t            k_iter = k / 2;
    dim_t            k_left = k % 2;

    b1 = b_cast;
    c1 = c_cast;

    // Loop over pairs of columns of c (and b).
    for ( j = 0; j < n_iter; ++j )
    {
        a1  = a_cast;
        c11 = c1;

        // Loop over blocks of four rows of c (and a).
        for ( i = 0; i < m_iter; ++i )
        {
            // Corners of the 4x2 block: rows 0-1 / 2-3 of columns 0 and 1.
            gamma00 = c11 + 0*rs_c + 0*cs_c;
            gamma20 = c11 + 2*rs_c + 0*cs_c;
            gamma01 = c11 + 0*rs_c + 1*cs_c;
            gamma21 = c11 + 2*rs_c + 1*cs_c;

            a11 = a1;
            b11 = b1;

            c00v.v = _mm_load_pd( gamma00 );
            c10v.v = _mm_load_pd( gamma20 );
            c01v.v = _mm_load_pd( gamma01 );
            c11v.v = _mm_load_pd( gamma21 );

            // Main k loop, two rank-1 updates per iteration.
            for ( h = 0; h < k_iter; ++h )
            {
                alpha00 = a11;
                alpha20 = a11 + 2;
                beta00  = b11;
                beta01  = b11 + cs_b;

                a0v.v = _mm_load_pd( alpha00 );
                a1v.v = _mm_load_pd( alpha20 );

                b0v.v = _mm_loaddup_pd( beta00 );

                c00v.v += a0v.v * b0v.v;
                c10v.v += a1v.v * b0v.v;

                b1v.v = _mm_loaddup_pd( beta01 );

                c01v.v += a0v.v * b1v.v;
                c11v.v += a1v.v * b1v.v;

                a0v.v = _mm_load_pd( alpha00 + cs_a );
                a1v.v = _mm_load_pd( alpha20 + cs_a );

                b0v.v = _mm_loaddup_pd( beta00 + 1 );

                c00v.v += a0v.v * b0v.v;
                c10v.v += a1v.v * b0v.v;

                b1v.v = _mm_loaddup_pd( beta01 + 1 );

                c01v.v += a0v.v * b1v.v;
                c11v.v += a1v.v * b1v.v;

                a11 += 2*cs_a;
                b11 += 2;
            }

            // Remaining rank-1 update when k is odd.
            for ( h = 0; h < k_left; ++h )
            {
                alpha00 = a11;
                alpha20 = a11 + 2;
                beta00  = b11;
                beta01  = b11 + cs_b;

                a0v.v = _mm_load_pd( alpha00 );
                a1v.v = _mm_load_pd( alpha20 );

                b0v.v = _mm_loaddup_pd( beta00 );

                c00v.v += a0v.v * b0v.v;
                c10v.v += a1v.v * b0v.v;

                b1v.v = _mm_loaddup_pd( beta01 );

                c01v.v += a0v.v * b1v.v;
                c11v.v += a1v.v * b1v.v;

                a11 += cs_a;
                b11 += 1;
            }

            _mm_store_pd( gamma00, c00v.v );
            _mm_store_pd( gamma20, c10v.v );
            _mm_store_pd( gamma01, c01v.v );
            _mm_store_pd( gamma21, c11v.v );

            a1  += 4;
            c11 += 4;
        }

        // Scalar cleanup for the m % 4 trailing rows of this column pair.
        // a1 and c11 now point at the first unprocessed row.
        for ( i = 0; i < m_left; ++i )
        {
            a11  = a1 + i;
            b11  = b1;
            rho0 = 0.0;
            rho1 = 0.0;

            for ( h = 0; h < k; ++h )
            {
                rho0 += *a11 * *b11;
                rho1 += *a11 * *(b11 + cs_b);

                a11 += cs_a;
                b11 += 1;
            }

            *(c11 + i)        += rho0;
            *(c11 + i + cs_c) += rho1;
        }

        b1 += 2*cs_b;
        c1 += 2*cs_c;
    }

    // Scalar cleanup for the trailing column when n is odd. b1 and c1 now
    // point at the first unprocessed column.
    for ( j = 0; j < n_left; ++j )
    {
        a1 = a_cast;

        for ( i = 0; i < m; ++i )
        {
            a11  = a1 + i;
            b11  = b1;
            rho0 = 0.0;

            for ( h = 0; h < k; ++h )
            {
                rho0 += *a11 * *b11;

                a11 += cs_a;
                b11 += 1;
            }

            *(c1 + i) += rho0;
        }

        b1 += cs_b;
        c1 += cs_c;
    }
}

View File

@@ -1,59 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based interface to the gemm_asm_var3 variant. It takes the scalar
// objects alpha and beta, the matrix objects a, b and c, and a gemm control
// tree node.
void bl2_gemm_asm_var3( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
gemm_t* cntl );
// GENPROT( chabc, varname ) expands to the prototype of the typed kernel
// PASTEMAC(chabc,varname), which takes the dimensions m, n, k followed by a
// buffer pointer, row stride and column stride for each of a, b and c.
// The invocations below declare the s, d, c and z instances of
// gemm_asm_var3; the macro supplies the terminating semicolon itself.
#undef GENPROT
#define GENPROT( chabc, varname ) \
\
void PASTEMAC(chabc,varname)( \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b, \
void* c, inc_t rs_c, inc_t cs_c \
);
GENPROT( s, gemm_asm_var3 )
GENPROT( d, gemm_asm_var3 )
GENPROT( c, gemm_asm_var3 )
GENPROT( z, gemm_asm_var3 )

View File

@@ -1,336 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// FUNCPTR_T is an alias for gemm_fp, the function-pointer type shared by the
// typed gemm_asm_var4 kernels: dimensions m, n, k followed by a buffer
// pointer and strides for each of a, b and c. Unlike the other variants, a
// carries a third stride, ps_a, in addition to its row and column strides.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
);
// Table of typed gemm_asm_var4 kernels. bl2_gemm_asm_var4() indexes it with
// the execution datatype of c.
// NOTE(review): the s, c, d, z ordering presumably mirrors the numeric values
// of the num_t enumeration -- confirm before reordering entries.
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
{
bl2_sgemm_asm_var4,
bl2_cgemm_asm_var4,
bl2_dgemm_asm_var4,
bl2_zgemm_asm_var4
};
//
// Object-based front end for the gemm_asm_var4 kernels.
//
// Pulls dimensions, buffers and strides (including the panel stride of a)
// out of the a, b and c objects and passes them to the typed kernel chosen
// by the execution datatype of c. alpha, beta and cntl are part of the
// interface but are not read here.
//
// The mixed-domain case (complex a and c with real b), which would require
// doubling m, cs_c and cs_a, is not handled by this function.
//
void bl2_gemm_asm_var4( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  b,
                        obj_t*  beta,
                        obj_t*  c,
                        gemm_t* cntl )
{
    // m and n come from the length and width of c; k from the width of a.
    const dim_t m = bl2_obj_length( *c );
    const dim_t n = bl2_obj_width( *c );
    const dim_t k = bl2_obj_width( *a );

    // Buffer (at the object's offset) and strides of each operand.
    void* const a_buf = bl2_obj_buffer_at_off( *a );
    const inc_t a_rs  = bl2_obj_row_stride( *a );
    const inc_t a_cs  = bl2_obj_col_stride( *a );
    const inc_t a_ps  = bl2_obj_panel_stride( *a );

    void* const b_buf = bl2_obj_buffer_at_off( *b );
    const inc_t b_rs  = bl2_obj_row_stride( *b );
    const inc_t b_cs  = bl2_obj_col_stride( *b );

    void* const c_buf = bl2_obj_buffer_at_off( *c );
    const inc_t c_rs  = bl2_obj_row_stride( *c );
    const inc_t c_cs  = bl2_obj_col_stride( *c );

    // Look up the kernel for the execution datatype of c and call it.
    const FUNCPTR_T kernel = ftypes[ bl2_obj_execution_datatype( *c ) ];

    kernel( m, n, k,
            a_buf, a_rs, a_cs, a_ps,
            b_buf, b_rs, b_cs,
            c_buf, c_rs, c_cs );
}
// Typed gemm_asm_var4 kernel for the 's' datatype character. The body is
// empty: the call returns immediately without reading a or b and without
// updating c.
// NOTE(review): looks like an unimplemented placeholder -- confirm.
void PASTEMAC(s,gemm_asm_var4)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
)
{
}
// Typed gemm_asm_var4 kernel for the 'c' datatype character. The body is
// empty: the call returns immediately without reading a or b and without
// updating c.
// NOTE(review): looks like an unimplemented placeholder -- confirm.
void PASTEMAC(c,gemm_asm_var4)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
)
{
}
// Typed gemm_asm_var4 kernel for the 'z' datatype character. The body is
// empty: the call returns immediately without reading a or b and without
// updating c.
// NOTE(review): looks like an unimplemented placeholder -- confirm.
void PASTEMAC(z,gemm_asm_var4)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
)
{
}
#include "pmmintrin.h"
// Overlay of one __m128d SSE value (member v) with an array of its two
// double lanes (member d).
typedef union
{
__m128d v;
double d[2];
} v2df_t;
// Defined for parity with the other asm variants; the d kernel below has no
// code conditional on it.
#define NOSSE 0
//
// Double-precision kernel for gemm_asm_var4: accumulates the product of a
// and b into c using a 4x2 register block of c, reading a as a sequence of
// 4-row panels.
//
//   m, n, k     - dimensions: c is m x n, a is m x k, b is k x n.
//   a, ps_a     - buffer of a and the distance (in doubles) between the
//                 starts of consecutive 4-row panels. Within a panel the
//                 code advances by 4 doubles per step of k.
//   b, cs_b     - buffer and column stride of b; consecutive k entries of a
//                 column are read from adjacent doubles.
//   c, cs_c     - buffer and column stride of c; consecutive rows of a
//                 column are addressed as adjacent doubles.
//   rs_a, cs_a, rs_b, rs_c - accepted but not read.
//
// All _mm_load_pd/_mm_store_pd accesses require 16-byte alignment of the
// addressed pair of doubles.
//
// Limitations (unchanged): only the leading 4*(m/4) rows and 2*(n/2) columns
// of c are updated; the trailing m % 4 rows and n % 2 column are not
// processed. NOTE(review): handling them needs the layout of a partial
// panel of a, which is not visible from this function.
//
// Change relative to the previous revision: the panel pointer into a was
// reset at the top of each column pair but never advanced between row
// blocks, so every 4-row block of c was updated with the first panel of a
// and ps_a went unused. The pointer now advances by ps_a after each block.
// The unused m_left/n_left locals were removed.
//
void PASTEMAC(d,gemm_asm_var4)(
                                dim_t   m,
                                dim_t   n,
                                dim_t   k,
                                void*   a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
                                void*   b, inc_t rs_b, inc_t cs_b,
                                void*   c, inc_t rs_c, inc_t cs_c
                              )
{
    double* restrict a_cast = a;
    double* restrict b_cast = b;
    double* restrict c_cast = c;
    double* restrict a1;
    double* restrict b1;
    double* restrict c1;
    double* restrict a11;
    double* restrict b11;
    double* restrict c11;
    double* restrict alpha00;
    double* restrict alpha20;
    double* restrict beta00;
    double* restrict beta01;
    double* restrict gamma00;
    double* restrict gamma20;
    double* restrict gamma01;
    double* restrict gamma21;

    v2df_t           c00v, c01v;
    v2df_t           c10v, c11v;
    v2df_t           a0v, a1v;
    v2df_t           b0v, b1v;

    dim_t            i, j, h;

    dim_t            n_iter = n / 2;
    dim_t            m_iter = m / 4;

    dim_t            k_iter = k / 2;
    dim_t            k_left = k % 2;

    b1 = b_cast;
    c1 = c_cast;

    // Loop over pairs of columns of c (and b).
    for ( j = 0; j < n_iter; ++j )
    {
        a1  = a_cast;
        c11 = c1;

        // Corners of the first 4x2 block: rows 0-1 / 2-3 of columns 0, 1.
        gamma00 = c11;
        gamma20 = c11 + 2;
        gamma01 = c11     + cs_c;
        gamma21 = c11 + 2 + cs_c;

        // Loop over blocks of four rows of c, one panel of a per block.
        for ( i = 0; i < m_iter; ++i )
        {
            a11 = a1;
            b11 = b1;

            c00v.v = _mm_load_pd( gamma00 );
            c10v.v = _mm_load_pd( gamma20 );
            c01v.v = _mm_load_pd( gamma01 );
            c11v.v = _mm_load_pd( gamma21 );

            alpha00 = a11;
            alpha20 = a11 + 2;
            beta00  = b11;
            beta01  = b11 + cs_b;

            // Main k loop, two rank-1 updates per iteration.
            for ( h = 0; h < k_iter; ++h )
            {
                a0v.v = _mm_load_pd( alpha00 );
                a1v.v = _mm_load_pd( alpha20 );
                alpha00 += 4;
                alpha20 += 4;

                b0v.v = _mm_loaddup_pd( beta00 );
                beta00 += 1;

                c00v.v += a0v.v * b0v.v;
                c10v.v += a1v.v * b0v.v;

                b1v.v = _mm_loaddup_pd( beta01 );
                beta01 += 1;

                c01v.v += a0v.v * b1v.v;
                c11v.v += a1v.v * b1v.v;

                a0v.v = _mm_load_pd( alpha00 );
                a1v.v = _mm_load_pd( alpha20 );
                alpha00 += 4;
                alpha20 += 4;

                b0v.v = _mm_loaddup_pd( beta00 );
                beta00 += 1;

                c00v.v += a0v.v * b0v.v;
                c10v.v += a1v.v * b0v.v;

                b1v.v = _mm_loaddup_pd( beta01 );
                beta01 += 1;

                c01v.v += a0v.v * b1v.v;
                c11v.v += a1v.v * b1v.v;
            }

            // Remaining rank-1 update when k is odd.
            for ( h = 0; h < k_left; ++h )
            {
                a0v.v = _mm_load_pd( alpha00 );
                a1v.v = _mm_load_pd( alpha20 );

                b0v.v = _mm_loaddup_pd( beta00++ );

                c00v.v += a0v.v * b0v.v;
                c10v.v += a1v.v * b0v.v;

                b1v.v = _mm_loaddup_pd( beta01++ );

                c01v.v += a0v.v * b1v.v;
                c11v.v += a1v.v * b1v.v;

                alpha00 += 4;
                alpha20 += 4;
            }

            _mm_store_pd( gamma00, c00v.v );
            _mm_store_pd( gamma20, c10v.v );
            _mm_store_pd( gamma01, c01v.v );
            _mm_store_pd( gamma21, c11v.v );

            // Move to the next panel of a and the next four rows of c.
            a1 += ps_a;

            gamma00 += 4;
            gamma20 += 4;
            gamma01 += 4;
            gamma21 += 4;
        }

        b1 += 2*cs_b;
        c1 += 2*cs_c;
    }
}

View File

@@ -1,59 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based interface to the gemm_asm_var4 variant. It takes the scalar
// objects alpha and beta, the matrix objects a, b and c, and a gemm control
// tree node.
void bl2_gemm_asm_var4( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
gemm_t* cntl );
// GENPROT( chabc, varname ) expands to the prototype of the typed kernel
// PASTEMAC(chabc,varname), which takes the dimensions m, n, k followed by a
// buffer pointer and strides for each of a, b and c; a additionally carries
// the stride ps_a. The invocations below declare the s, d, c and z
// instances of gemm_asm_var4; the macro supplies the terminating semicolon
// itself.
#undef GENPROT
#define GENPROT( chabc, varname ) \
\
void PASTEMAC(chabc,varname)( \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, \
void* c, inc_t rs_c, inc_t cs_c \
);
GENPROT( s, gemm_asm_var4 )
GENPROT( d, gemm_asm_var4 )
GENPROT( c, gemm_asm_var4 )
GENPROT( z, gemm_asm_var4 )

View File

@@ -1,169 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// FUNCPTR_T is an alias for gemm_fp, the function-pointer type shared by the
// typed gemm_unb_var2 functions: dimensions m, n, k followed by a buffer
// pointer, row stride and column stride for each of a, b and c.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b,
void* c, inc_t rs_c, inc_t cs_c
);
// Table named ftypes that bl2_gemm_unb_var2() indexes by execution datatype.
// NOTE(review): GENARRAY presumably expands to the array declarator plus an
// initializer listing the typed gemm_unb_var2 functions -- confirm against
// its definition.
static FUNCPTR_T GENARRAY(ftypes,gemm_unb_var2);
//
// Object-based front end for the unblocked gemm variant 2.
//
// Pulls dimensions, buffers and strides out of the a, b and c objects and
// passes them to the typed implementation chosen by the execution datatype
// of c. alpha, beta and cntl are part of the interface but are not read
// here.
//
// The mixed-domain case (complex a and c with real b), which would require
// doubling m, cs_c and cs_a, is not handled by this function.
//
void bl2_gemm_unb_var2( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  b,
                        obj_t*  beta,
                        obj_t*  c,
                        gemm_t* cntl )
{
    // m and n come from the length and width of c; k from the width of a.
    const dim_t m = bl2_obj_length( *c );
    const dim_t n = bl2_obj_width( *c );
    const dim_t k = bl2_obj_width( *a );

    // Buffer (at the object's offset) and strides of each operand.
    void* const a_buf = bl2_obj_buffer_at_off( *a );
    const inc_t a_rs  = bl2_obj_row_stride( *a );
    const inc_t a_cs  = bl2_obj_col_stride( *a );

    void* const b_buf = bl2_obj_buffer_at_off( *b );
    const inc_t b_rs  = bl2_obj_row_stride( *b );
    const inc_t b_cs  = bl2_obj_col_stride( *b );

    void* const c_buf = bl2_obj_buffer_at_off( *c );
    const inc_t c_rs  = bl2_obj_row_stride( *c );
    const inc_t c_cs  = bl2_obj_col_stride( *c );

    // Look up the typed function for the execution datatype of c and call it.
    const FUNCPTR_T typed_fn = ftypes[ bl2_obj_execution_datatype( *c ) ];

    typed_fn( m, n, k,
              a_buf, a_rs, a_cs,
              b_buf, b_rs, b_cs,
              c_buf, c_rs, c_cs );
}
//
// Type-generic body of gemm_unb_var2, instantiated once per datatype by
// INSERT_GENTFUNC_BASIC below.
//
// For every element (row, col) of c, a scalar accumulator is cleared with
// the set0s macro, the k products of the corresponding row of a and column
// of b are folded into it with the dots macro, and the result is added to
// the element of c with the adds macro. a is walked with rs_a between rows
// and cs_a along a row; b with rs_b down a column and cs_b between columns;
// c with rs_c between rows and cs_c between columns.
//
// The function returns before touching any operand when bl2_zero_dim3()
// is true for ( m, n, k ).
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   k, \
                           void*   a, inc_t rs_a, inc_t cs_a, \
                           void*   b, inc_t rs_b, inc_t cs_b, \
                           void*   c, inc_t rs_c, inc_t cs_c  \
                         ) \
{ \
    ctype* a_row; \
    ctype* b_col; \
    ctype* c_col; \
    ctype* a_elem; \
    ctype* b_elem; \
    ctype* c_elem; \
    ctype  acc; \
    dim_t  row, col, p; \
\
    if ( bl2_zero_dim3( m, n, k ) ) return; \
\
    c_col = c; \
    b_col = b; \
\
    /* One pass per column of c and b. */ \
    for ( col = 0; col < n; ++col, c_col += cs_c, b_col += cs_b ) \
    { \
        c_elem = c_col; \
        a_row  = a; \
\
        /* One pass per row of c and a. */ \
        for ( row = 0; row < m; ++row, c_elem += rs_c, a_row += rs_a ) \
        { \
            a_elem = a_row; \
            b_elem = b_col; \
\
            PASTEMAC(ch,set0s)( acc ); \
\
            /* Accumulate over the shared dimension of a and b. */ \
            for ( p = 0; p < k; ++p, a_elem += cs_a, b_elem += rs_b ) \
            { \
                PASTEMAC(ch,dots)( *a_elem, *b_elem, acc ); \
            } \
\
            PASTEMAC(ch,adds)( acc, *c_elem ); \
        } \
    } \
}

INSERT_GENTFUNC_BASIC( gemm, gemm_unb_var2 )

View File

@@ -1,56 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based interface to the unblocked gemm variant 2. It takes the
// scalar objects alpha and beta, the matrix objects a, b and c, and a gemm
// control tree node.
void bl2_gemm_unb_var2( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
gemm_t* cntl );
// GENTPROT( ctype, ch, varname ) expands to the prototype of the typed
// function PASTEMAC(ch,varname), which takes the dimensions m, n, k followed
// by a buffer pointer, row stride and column stride for each of a, b and c.
// The ctype argument is not used by the expansion.
// NOTE(review): INSERT_GENTPROT_BASIC presumably invokes GENTPROT once per
// supported datatype -- confirm against its definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b, \
void* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( gemm_unb_var2 )

View File

@@ -1,163 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
// Blocked her2k variant 1 (the name suggests the lower-stored case --
// TODO confirm).
//
// Outline of what the code does:
//   1. Scales c by beta via bl2_scalm_int().
//   2. Packs bh (scaled by alpha) and ah (scaled by alpha_conj) once, before
//      the loop, both through the packm_b sub-control node.
//   3. Walks the m dimension of c in blocks of b_alg. For each block it
//      takes the matching partitions a1, b1, c1, restricts c1 and the packed
//      bh/ah to their first nL columns, packs a1, b1 and that part of c1,
//      calls bl2_her2k_int() on the packed operands, and unpacks c1.
//   4. Releases the pack objects.
//
// Parameters:
//   alpha, alpha_conj - scalars passed to the packing of (a1, bh) and
//                       (b1, ah) respectively, and on to bl2_her2k_int().
//   a, b              - matrices partitioned along m inside the loop.
//   bh, ah            - matrices packed once up front.
//   beta, c           - scalar and the matrix that is updated.
//   cntl              - control tree node supplying the blocksize and the
//                       scalm/packm/her2k/unpackm sub-nodes.
//
// NOTE(review): beta is handed to bl2_scalm_int(), to the packing of c1 and
// to bl2_her2k_int(); whether more than one of these actually applies it
// depends on the control tree -- confirm.
void bl2_her2k_l_blk_var1( obj_t* alpha,
obj_t* a,
obj_t* bh,
obj_t* alpha_conj,
obj_t* b,
obj_t* ah,
obj_t* beta,
obj_t* c,
her2k_t* cntl )
{
obj_t a1, a1_pack;
obj_t bh_pack, bhL_pack;
obj_t b1, b1_pack;
obj_t ah_pack, ahL_pack;
obj_t c1;
obj_t c1L, c1L_pack;
dim_t i;
dim_t b_alg;
dim_t m_trans;
dim_t offL, nL;
// Initialize all pack objects that are passed into packm_init().
bl2_obj_init_pack( &a1_pack );
bl2_obj_init_pack( &bh_pack );
bl2_obj_init_pack( &b1_pack );
bl2_obj_init_pack( &ah_pack );
bl2_obj_init_pack( &c1L_pack );
// Query dimension in partitioning direction.
m_trans = bl2_obj_length_after_trans( *c );
// Scale C by beta (if instructed).
bl2_scalm_int( beta,
c,
cntl_sub_scalm( cntl ) );
// Initialize objects for packing B' and A'.
// Both use the packm_b sub-control node.
bl2_packm_init( bh, &bh_pack,
cntl_sub_packm_b( cntl ) );
bl2_packm_init( ah, &ah_pack,
cntl_sub_packm_b( cntl ) );
// Pack B' and scale by alpha (if instructed).
bl2_packm_int( alpha,
bh, &bh_pack,
cntl_sub_packm_b( cntl ) );
// Pack A' and scale by alpha_conj (if instructed).
bl2_packm_int( alpha_conj,
ah, &ah_pack,
cntl_sub_packm_b( cntl ) );
// Partition along the m dimension.
// b_alg is assigned at the top of the body, before the loop increment
// first reads it.
for ( i = 0; i < m_trans; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bl2_determine_blocksize_b( i, m_trans, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1, B1 and C1.
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
i, b_alg, a, &a1 );
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
i, b_alg, b, &b1 );
bl2_acquire_mpart_b2t( BLIS_SUBPART1,
i, b_alg, c, &c1 );
// Partition off the stored region of C1 and the corresponding regions
// of Bh_pack and Ah_pack. We compute the width of the subpartition
// taking the location of the diagonal into account.
offL = 0;
// nL is capped at the width of c1; otherwise it is the diagonal offset of
// c1 plus the current blocksize.
nL = bl2_min( bl2_obj_width_after_trans( c1 ),
bl2_obj_diag_offset_after_trans( c1 ) + b_alg );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offL, nL, &c1, &c1L );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offL, nL, &bh_pack, &bhL_pack );
bl2_acquire_mpart_l2r( BLIS_SUBPART1,
offL, nL, &ah_pack, &ahL_pack );
// Initialize objects for packing A1, B1, and C1.
// A1 and B1 both use the packm_a sub-control node; C1 uses packm_c.
bl2_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bl2_packm_init( &b1, &b1_pack,
cntl_sub_packm_a( cntl ) );
bl2_packm_init( &c1L, &c1L_pack,
cntl_sub_packm_c( cntl ) );
// Pack A1 and scale by alpha (if instructed).
bl2_packm_int( alpha,
&a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
// Pack B1 and scale by alpha_conj (if instructed).
bl2_packm_int( alpha_conj,
&b1, &b1_pack,
cntl_sub_packm_a( cntl ) );
// Pack C1 and scale by beta (if instructed).
bl2_packm_int( beta,
&c1L, &c1L_pack,
cntl_sub_packm_c( cntl ) );
// Perform her2k subproblem.
bl2_her2k_int( alpha,
&a1_pack,
&bhL_pack,
alpha_conj,
&b1_pack,
&ahL_pack,
beta,
&c1L_pack,
cntl_sub_her2k( cntl ) );
// Unpack C1 (if C1 was packed).
bl2_unpackm_int( &c1L_pack, &c1L,
cntl_sub_unpackm_c( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bl2_obj_release_pack( &a1_pack );
bl2_obj_release_pack( &bh_pack );
bl2_obj_release_pack( &b1_pack );
bl2_obj_release_pack( &ah_pack );
bl2_obj_release_pack( &c1L_pack );
}

View File

@@ -1,198 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
extern gemm_t* gemm_cntl_bp_ke;
//
// bl2_her2k_l_blk_var1()
//
// Blocked her2k variant that partitions a, b and c along the m dimension
// (bottom-to-top, via bl2_acquire_mpart_b2t) while bh and ah are packed once,
// up front. Each row block C1 is split at its diagonal offset into:
//   - C1L: the nL columns to the left of the diagonal offset, updated with
//     two bl2_gemm_int() calls (A1 * BhL and B1 * AhL) using gemm_cntl_bp_ke;
//   - C1M: the next b_alg columns, updated by the her2k sub-problem taken
//     from the control tree.
// NOTE(review): the "_l_" in the name presumably means that C is stored in
// its lower triangle; nothing in this function checks that.
//
// Parameters:
//   alpha, alpha_conj  scalars handed to packm/gemm/her2k for the A1 * Bh and
//                      B1 * Ah products, respectively.
//   a, b               operands partitioned along m.
//   bh, ah             operands packed whole with cntl_sub_packm_b().
//   beta               scalar applied to c (by scalm/packm, if instructed).
//   c                  matrix being updated.
//   cntl               her2k control tree node supplying the sub-operations.
//
void bl2_her2k_l_blk_var1( obj_t*   alpha,
                           obj_t*   a,
                           obj_t*   bh,
                           obj_t*   alpha_conj,
                           obj_t*   b,
                           obj_t*   ah,
                           obj_t*   beta,
                           obj_t*   c,
                           her2k_t* cntl )
{
	obj_t a1, a1_pack;
	obj_t bh_pack, bhL_pack, bhM_pack;
	obj_t b1, b1_pack;
	obj_t ah_pack, ahL_pack, ahM_pack;
	obj_t c1;
	obj_t c1L, c1M, c1L_pack, c1M_pack;

	dim_t i;
	dim_t b_alg;
	dim_t m_trans;
	dim_t offL, nL;

	// Initialize all pack objects that are passed into packm_init(). This
	// includes c1M_pack, which is handed to packm_init() inside the loop.
	bl2_obj_init_pack( &a1_pack );
	bl2_obj_init_pack( &bh_pack );
	bl2_obj_init_pack( &b1_pack );
	bl2_obj_init_pack( &ah_pack );
	bl2_obj_init_pack( &c1L_pack );
	bl2_obj_init_pack( &c1M_pack );

	// Query dimension in partitioning direction.
	m_trans = bl2_obj_length_after_trans( *c );

	// Scale C by beta (if instructed).
	bl2_scalm_int( beta,
	               c,
	               cntl_sub_scalm( cntl ) );

	// Initialize objects for packing B' and A'.
	bl2_packm_init( bh, &bh_pack,
	                cntl_sub_packm_b( cntl ) );
	bl2_packm_init( ah, &ah_pack,
	                cntl_sub_packm_b( cntl ) );

	// Pack B' and scale by alpha (if instructed).
	bl2_packm_int( alpha,
	               bh, &bh_pack,
	               cntl_sub_packm_b( cntl ) );

	// Pack A' and scale by alpha_conj (if instructed).
	bl2_packm_int( alpha_conj,
	               ah, &ah_pack,
	               cntl_sub_packm_b( cntl ) );

	// Partition along the m dimension.
	for ( i = 0; i < m_trans; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bl2_determine_blocksize_b( i, m_trans, a,
		                                   cntl_blocksize( cntl ) );

		// Acquire partitions for A1, B1 and C1.
		bl2_acquire_mpart_b2t( BLIS_SUBPART1,
		                       i, b_alg, a, &a1 );
		bl2_acquire_mpart_b2t( BLIS_SUBPART1,
		                       i, b_alg, b, &b1 );
		bl2_acquire_mpart_b2t( BLIS_SUBPART1,
		                       i, b_alg, c, &c1 );

		// Split C1 (and the matching columns of Bh_pack and Ah_pack) at the
		// diagonal offset: the first nL columns form the "L" partitions and
		// the following b_alg columns form the "M" partitions.
		offL = 0;
		nL   = bl2_min( bl2_obj_width_after_trans( c1 ),
		                bl2_obj_diag_offset_after_trans( c1 ) );

		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       offL, nL, &c1,      &c1L );
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       offL, nL, &bh_pack, &bhL_pack );
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       offL, nL, &ah_pack, &ahL_pack );

		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       nL, b_alg, &c1,      &c1M );
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       nL, b_alg, &bh_pack, &bhM_pack );
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       nL, b_alg, &ah_pack, &ahM_pack );

		// Initialize objects for packing A1, B1, C1L and C1M.
		bl2_packm_init( &a1, &a1_pack,
		                cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &b1, &b1_pack,
		                cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &c1L, &c1L_pack,
		                cntl_sub_packm_c( cntl ) );
		bl2_packm_init( &c1M, &c1M_pack,
		                cntl_sub_packm_c( cntl ) );

		// Pack A1 and scale by alpha (if instructed).
		bl2_packm_int( alpha,
		               &a1, &a1_pack,
		               cntl_sub_packm_a( cntl ) );

		// Pack B1 and scale by alpha_conj (if instructed).
		bl2_packm_int( alpha_conj,
		               &b1, &b1_pack,
		               cntl_sub_packm_a( cntl ) );

		// Pack C1L and scale by beta (if instructed). Skipped when there
		// are no columns to the left of the diagonal offset.
		if ( bl2_obj_width( c1L ) > 0 )
			bl2_packm_int( beta,
			               &c1L, &c1L_pack,
			               cntl_sub_packm_c( cntl ) );

		// Pack C1M and scale by beta (if instructed).
		bl2_packm_int( beta,
		               &c1M, &c1M_pack,
		               cntl_sub_packm_c( cntl ) );

		// Update C1L with two general matrix products.
		if ( bl2_obj_width( c1L ) > 0 )
		{
			bl2_gemm_int( alpha,
			              &a1_pack,
			              &bhL_pack,
			              beta,
			              &c1L_pack,
			              gemm_cntl_bp_ke );
			bl2_gemm_int( alpha_conj,
			              &b1_pack,
			              &ahL_pack,
			              beta,
			              &c1L_pack,
			              gemm_cntl_bp_ke );
		}

		// Perform her2k subproblem on C1M.
		bl2_her2k_int( alpha,
		               &a1_pack,
		               &bhM_pack,
		               alpha_conj,
		               &b1_pack,
		               &ahM_pack,
		               beta,
		               &c1M_pack,
		               cntl_sub_her2k( cntl ) );

		// Unpack C1L (if C1L was packed).
		if ( bl2_obj_width( c1L ) > 0 )
			bl2_unpackm_int( &c1L_pack, &c1L,
			                 cntl_sub_unpackm_c( cntl ) );

		// Unpack C1M (if C1M was packed) so that the her2k update computed
		// in c1M_pack is written back to C.
		bl2_unpackm_int( &c1M_pack, &c1M,
		                 cntl_sub_unpackm_c( cntl ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
	bl2_obj_release_pack( &a1_pack );
	bl2_obj_release_pack( &bh_pack );
	bl2_obj_release_pack( &b1_pack );
	bl2_obj_release_pack( &ah_pack );
	bl2_obj_release_pack( &c1L_pack );
	bl2_obj_release_pack( &c1M_pack );
}

View File

@@ -1,246 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
//
// bl2_her2k_l_blk_var4()
//
// Blocked her2k variant that partitions a, b and c along the m dimension
// (bottom-to-top, via bl2_acquire_mpart_b2t). The first row block is fused
// with the packing of bh and ah: both are packed incrementally, bn_inc
// columns at a time, and every freshly packed panel is consumed right away
// by a bl2_herk_int() call on the matching columns of the packed C1. The
// remaining row blocks reuse the fully packed bh/ah and run the her2k
// sub-problem on the first nL columns of C1.
// NOTE(review): the "_l_" in the name presumably means that C is stored in
// its lower triangle; nothing in this function checks that.
//
// Parameters:
//   alpha, alpha_conj  scalars handed to packm/her2k for the A1 * Bh and
//                      B1 * Ah products, respectively.
//   a, b               operands partitioned along m.
//   bh, ah             operands packed with cntl_sub_packm_b().
//   beta               scalar applied to c (by scalm/packm, if instructed).
//   c                  matrix being updated.
//   cntl               her2k control tree node supplying the sub-operations.
//
void bl2_her2k_l_blk_var4( obj_t*   alpha,
                           obj_t*   a,
                           obj_t*   bh,
                           obj_t*   alpha_conj,
                           obj_t*   b,
                           obj_t*   ah,
                           obj_t*   beta,
                           obj_t*   c,
                           her2k_t* cntl )
{
	obj_t a1, a1_pack;
	obj_t bh_pack, bhL_pack;
	obj_t b1, b1_pack;
	obj_t ah_pack, ahL_pack;
	obj_t c1, c1_pack;
	obj_t c1L, c1L_pack;

	dim_t i;
	dim_t bm_alg;
	dim_t m_trans;
	dim_t offL, nL;

	// Initialize all pack objects that are passed into packm_init().
	bl2_obj_init_pack( &a1_pack );
	bl2_obj_init_pack( &bh_pack );
	bl2_obj_init_pack( &b1_pack );
	bl2_obj_init_pack( &ah_pack );
	bl2_obj_init_pack( &c1_pack );
	bl2_obj_init_pack( &c1L_pack );

	// Query dimension in partitioning direction.
	m_trans = bl2_obj_length_after_trans( *c );

	// Scale C by beta (if instructed).
	bl2_scalm_int( beta,
	               c,
	               cntl_sub_scalm( cntl ) );

	// Initialize object for packing B'.
	bl2_packm_init( bh, &bh_pack,
	                cntl_sub_packm_b( cntl ) );

	// Initialize object for packing A'.
	bl2_packm_init( ah, &ah_pack,
	                cntl_sub_packm_b( cntl ) );

	// Fuse the first iteration with incremental packing and computation.
	{
		obj_t bh_inc, bh_pack_inc;
		obj_t ah_inc, ah_pack_inc;
		obj_t c1_pack_inc;

		dim_t j;
		dim_t bn_inc;
		dim_t n_trans;

		// Query dimension in partitioning direction.
		n_trans = bl2_obj_width( bh_pack );

		// Determine the current algorithmic blocksize.
		bm_alg = bl2_determine_blocksize_b( 0, m_trans, a,
		                                    cntl_blocksize( cntl ) );

		// Acquire partitions for A1, B1, and C1.
		bl2_acquire_mpart_b2t( BLIS_SUBPART1,
		                       0, bm_alg, a, &a1 );
		bl2_acquire_mpart_b2t( BLIS_SUBPART1,
		                       0, bm_alg, b, &b1 );
		bl2_acquire_mpart_b2t( BLIS_SUBPART1,
		                       0, bm_alg, c, &c1 );

		// Initialize objects for packing A1, B1, and C1.
		bl2_packm_init( &a1, &a1_pack, cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &b1, &b1_pack, cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &c1, &c1_pack, cntl_sub_packm_c( cntl ) );

		// Pack C1 and scale by beta (if instructed).
		bl2_packm_int( beta, &c1, &c1_pack, cntl_sub_packm_c( cntl ) );

		// Pack A1 and scale by alpha (if instructed).
		bl2_packm_int( alpha, &a1, &a1_pack, cntl_sub_packm_a( cntl ) );

		// Pack B1 and scale by alpha_conj (if instructed).
		bl2_packm_int( alpha_conj, &b1, &b1_pack, cntl_sub_packm_a( cntl ) );

		// Partition along the n dimension.
		for ( j = 0; j < n_trans; j += bn_inc )
		{
			// Determine the current incremental packing blocksize.
			bn_inc = bl2_determine_blocksize_f( j, n_trans, a,
			                                    cntl_blocksize_aux( cntl ) );

			// Acquire incremental partitions of Bh, Ah and their packed
			// counterparts.
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       bh, &bh_inc );
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       &bh_pack, &bh_pack_inc );
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       ah, &ah_inc );
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       &ah_pack, &ah_pack_inc );

			// Acquire the matching columns of the packed C1. Both products
			// below accumulate into this same partition, so it only needs
			// to be acquired once per iteration.
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       &c1_pack, &c1_pack_inc );

			// Pack Bh_inc and scale by alpha (if instructed).
			bl2_packm_int( alpha, &bh_inc, &bh_pack_inc, cntl_sub_packm_b( cntl ) );

			// Perform herk subproblem with A1 and the new panel of Bh.
			bl2_herk_int( &BLIS_ONE,
			              &a1_pack,
			              &bh_pack_inc,
			              &BLIS_ONE,
			              &c1_pack_inc,
			              cntl_sub_herk( cntl ) );

			// Pack Ah_inc and scale by alpha_conj (if instructed).
			bl2_packm_int( alpha_conj, &ah_inc, &ah_pack_inc, cntl_sub_packm_b( cntl ) );

			// Perform herk subproblem with B1 and the new panel of Ah.
			bl2_herk_int( &BLIS_ONE,
			              &b1_pack,
			              &ah_pack_inc,
			              &BLIS_ONE,
			              &c1_pack_inc,
			              cntl_sub_herk( cntl ) );
		}

		// Unpack C1 (if C1 was packed).
		bl2_unpackm_int( &c1_pack, &c1, cntl_sub_unpackm_c( cntl ) );
	}

	// Partition along the m dimension, starting after the block that was
	// handled by the fused first iteration above.
	for ( i = bm_alg; i < m_trans; i += bm_alg )
	{
		// Determine the current algorithmic blocksize.
		bm_alg = bl2_determine_blocksize_b( i, m_trans, a,
		                                    cntl_blocksize( cntl ) );

		// Acquire partitions for A1, B1, and C1.
		bl2_acquire_mpart_b2t( BLIS_SUBPART1,
		                       i, bm_alg, a, &a1 );
		bl2_acquire_mpart_b2t( BLIS_SUBPART1,
		                       i, bm_alg, b, &b1 );
		bl2_acquire_mpart_b2t( BLIS_SUBPART1,
		                       i, bm_alg, c, &c1 );

		// Partition off the stored region of C1 and the corresponding regions
		// of Bh_pack and Ah_pack. We compute the width of the subpartition
		// taking the location of the diagonal into account.
		offL = 0;
		nL   = bl2_min( bl2_obj_width_after_trans( c1 ),
		                bl2_obj_diag_offset_after_trans( c1 ) + bm_alg );

		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       offL, nL, &c1,      &c1L );
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       offL, nL, &bh_pack, &bhL_pack );
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       offL, nL, &ah_pack, &ahL_pack );

		// Initialize objects for packing A1, B1, and C1.
		bl2_packm_init( &a1, &a1_pack,
		                cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &b1, &b1_pack,
		                cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &c1L, &c1L_pack,
		                cntl_sub_packm_c( cntl ) );

		// Pack A1 and scale by alpha (if instructed).
		bl2_packm_int( alpha,
		               &a1, &a1_pack,
		               cntl_sub_packm_a( cntl ) );

		// Pack B1 and scale by alpha_conj (if instructed).
		bl2_packm_int( alpha_conj,
		               &b1, &b1_pack,
		               cntl_sub_packm_a( cntl ) );

		// Pack C1 and scale by beta (if instructed).
		bl2_packm_int( beta,
		               &c1L, &c1L_pack,
		               cntl_sub_packm_c( cntl ) );

		// Perform her2k subproblem.
		bl2_her2k_int( alpha,
		               &a1_pack,
		               &bhL_pack,
		               alpha_conj,
		               &b1_pack,
		               &ahL_pack,
		               beta,
		               &c1L_pack,
		               cntl_sub_her2k( cntl ) );

		// Unpack C1 (if C1 was packed).
		bl2_unpackm_int( &c1L_pack, &c1L,
		                 cntl_sub_unpackm_c( cntl ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
	bl2_obj_release_pack( &a1_pack );
	bl2_obj_release_pack( &bh_pack );
	bl2_obj_release_pack( &b1_pack );
	bl2_obj_release_pack( &ah_pack );
	bl2_obj_release_pack( &c1_pack );
	bl2_obj_release_pack( &c1L_pack );
}

View File

@@ -1,245 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
//
// bl2_her2k_u_blk_var4()
//
// Blocked her2k variant that partitions a, b and c along the m dimension
// (top-to-bottom, via bl2_acquire_mpart_t2b). The first row block is fused
// with the packing of bh and ah: both are packed incrementally, bn_inc
// columns at a time, and every freshly packed panel is consumed right away
// by a bl2_herk_int() call on the matching columns of the packed C1. The
// remaining row blocks reuse the fully packed bh/ah and run the her2k
// sub-problem on the nR columns of C1 that start at column offR.
// NOTE(review): the "_u_" in the name presumably means that C is stored in
// its upper triangle; nothing in this function checks that.
//
// Parameters:
//   alpha, alpha_conj  scalars handed to packm/her2k for the A1 * Bh and
//                      B1 * Ah products, respectively.
//   a, b               operands partitioned along m.
//   bh, ah             operands packed with cntl_sub_packm_b().
//   beta               scalar applied to c (by scalm/packm, if instructed).
//   c                  matrix being updated.
//   cntl               her2k control tree node supplying the sub-operations.
//
void bl2_her2k_u_blk_var4( obj_t*   alpha,
                           obj_t*   a,
                           obj_t*   bh,
                           obj_t*   alpha_conj,
                           obj_t*   b,
                           obj_t*   ah,
                           obj_t*   beta,
                           obj_t*   c,
                           her2k_t* cntl )
{
	obj_t a1, a1_pack;
	obj_t bh_pack, bhR_pack;
	obj_t b1, b1_pack;
	obj_t ah_pack, ahR_pack;
	obj_t c1, c1_pack;
	obj_t c1R, c1R_pack;

	dim_t i;
	dim_t bm_alg;
	dim_t m_trans;
	dim_t offR, nR;

	// Initialize all pack objects that are passed into packm_init().
	bl2_obj_init_pack( &a1_pack );
	bl2_obj_init_pack( &bh_pack );
	bl2_obj_init_pack( &b1_pack );
	bl2_obj_init_pack( &ah_pack );
	bl2_obj_init_pack( &c1_pack );
	bl2_obj_init_pack( &c1R_pack );

	// Query dimension in partitioning direction.
	m_trans = bl2_obj_length_after_trans( *c );

	// Scale C by beta (if instructed).
	bl2_scalm_int( beta,
	               c,
	               cntl_sub_scalm( cntl ) );

	// Initialize object for packing B'.
	bl2_packm_init( bh, &bh_pack,
	                cntl_sub_packm_b( cntl ) );

	// Initialize object for packing A'.
	bl2_packm_init( ah, &ah_pack,
	                cntl_sub_packm_b( cntl ) );

	// Fuse the first iteration with incremental packing and computation.
	{
		obj_t bh_inc, bh_pack_inc;
		obj_t ah_inc, ah_pack_inc;
		obj_t c1_pack_inc;

		dim_t j;
		dim_t bn_inc;
		dim_t n_trans;

		// Query dimension in partitioning direction.
		n_trans = bl2_obj_width( bh_pack );

		// Determine the current algorithmic blocksize.
		bm_alg = bl2_determine_blocksize_f( 0, m_trans, a,
		                                    cntl_blocksize( cntl ) );

		// Acquire partitions for A1, B1, and C1.
		bl2_acquire_mpart_t2b( BLIS_SUBPART1,
		                       0, bm_alg, a, &a1 );
		bl2_acquire_mpart_t2b( BLIS_SUBPART1,
		                       0, bm_alg, b, &b1 );
		bl2_acquire_mpart_t2b( BLIS_SUBPART1,
		                       0, bm_alg, c, &c1 );

		// Initialize objects for packing A1, B1, and C1.
		bl2_packm_init( &a1, &a1_pack, cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &b1, &b1_pack, cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &c1, &c1_pack, cntl_sub_packm_c( cntl ) );

		// Pack C1 and scale by beta (if instructed).
		bl2_packm_int( beta, &c1, &c1_pack, cntl_sub_packm_c( cntl ) );

		// Pack A1 and scale by alpha (if instructed).
		bl2_packm_int( alpha, &a1, &a1_pack, cntl_sub_packm_a( cntl ) );

		// Pack B1 and scale by alpha_conj (if instructed).
		bl2_packm_int( alpha_conj, &b1, &b1_pack, cntl_sub_packm_a( cntl ) );

		// Partition along the n dimension.
		for ( j = 0; j < n_trans; j += bn_inc )
		{
			// Determine the current incremental packing blocksize.
			bn_inc = bl2_determine_blocksize_f( j, n_trans, a,
			                                    cntl_blocksize_aux( cntl ) );

			// Acquire incremental partitions of Bh, Ah and their packed
			// counterparts.
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       bh, &bh_inc );
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       &bh_pack, &bh_pack_inc );
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       ah, &ah_inc );
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       &ah_pack, &ah_pack_inc );

			// Acquire the matching columns of the packed C1. Both products
			// below accumulate into this same partition, so it only needs
			// to be acquired once per iteration.
			bl2_acquire_mpart_l2r( BLIS_SUBPART1, j, bn_inc,
			                       &c1_pack, &c1_pack_inc );

			// Pack Bh_inc and scale by alpha (if instructed).
			bl2_packm_int( alpha, &bh_inc, &bh_pack_inc, cntl_sub_packm_b( cntl ) );

			// Perform herk subproblem with A1 and the new panel of Bh.
			bl2_herk_int( &BLIS_ONE,
			              &a1_pack,
			              &bh_pack_inc,
			              &BLIS_ONE,
			              &c1_pack_inc,
			              cntl_sub_herk( cntl ) );

			// Pack Ah_inc and scale by alpha_conj (if instructed).
			bl2_packm_int( alpha_conj, &ah_inc, &ah_pack_inc, cntl_sub_packm_b( cntl ) );

			// Perform herk subproblem with B1 and the new panel of Ah.
			bl2_herk_int( &BLIS_ONE,
			              &b1_pack,
			              &ah_pack_inc,
			              &BLIS_ONE,
			              &c1_pack_inc,
			              cntl_sub_herk( cntl ) );
		}

		// Unpack C1 (if C1 was packed).
		bl2_unpackm_int( &c1_pack, &c1, cntl_sub_unpackm_c( cntl ) );
	}

	// Partition along the m dimension, starting after the block that was
	// handled by the fused first iteration above.
	for ( i = bm_alg; i < m_trans; i += bm_alg )
	{
		// Determine the current algorithmic blocksize.
		bm_alg = bl2_determine_blocksize_f( i, m_trans, a,
		                                    cntl_blocksize( cntl ) );

		// Acquire partitions for A1, B1, and C1.
		bl2_acquire_mpart_t2b( BLIS_SUBPART1,
		                       i, bm_alg, a, &a1 );
		bl2_acquire_mpart_t2b( BLIS_SUBPART1,
		                       i, bm_alg, b, &b1 );
		bl2_acquire_mpart_t2b( BLIS_SUBPART1,
		                       i, bm_alg, c, &c1 );

		// Partition off the stored region of C1 and the corresponding regions
		// of Bh_pack and Ah_pack. The region starts at the (non-negative)
		// diagonal offset of C1 and extends to its right edge.
		offR = bl2_max( 0, bl2_obj_diag_offset_after_trans( c1 ) );
		nR   = bl2_obj_width_after_trans( c1 ) - offR;

		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       offR, nR, &c1,      &c1R );
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       offR, nR, &bh_pack, &bhR_pack );
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       offR, nR, &ah_pack, &ahR_pack );

		// Initialize objects for packing A1, B1, and C1.
		bl2_packm_init( &a1, &a1_pack,
		                cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &b1, &b1_pack,
		                cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &c1R, &c1R_pack,
		                cntl_sub_packm_c( cntl ) );

		// Pack A1 and scale by alpha (if instructed).
		bl2_packm_int( alpha,
		               &a1, &a1_pack,
		               cntl_sub_packm_a( cntl ) );

		// Pack B1 and scale by alpha_conj (if instructed).
		bl2_packm_int( alpha_conj,
		               &b1, &b1_pack,
		               cntl_sub_packm_a( cntl ) );

		// Pack C1 and scale by beta (if instructed).
		bl2_packm_int( beta,
		               &c1R, &c1R_pack,
		               cntl_sub_packm_c( cntl ) );

		// Perform her2k subproblem.
		bl2_her2k_int( alpha,
		               &a1_pack,
		               &bhR_pack,
		               alpha_conj,
		               &b1_pack,
		               &ahR_pack,
		               beta,
		               &c1R_pack,
		               cntl_sub_her2k( cntl ) );

		// Unpack C1 (if C1 was packed).
		bl2_unpackm_int( &c1R_pack, &c1R,
		                 cntl_sub_unpackm_c( cntl ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
	bl2_obj_release_pack( &a1_pack );
	bl2_obj_release_pack( &bh_pack );
	bl2_obj_release_pack( &b1_pack );
	bl2_obj_release_pack( &ah_pack );
	bl2_obj_release_pack( &c1_pack );
	bl2_obj_release_pack( &c1R_pack );
}

View File

@@ -1,358 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Signature shared by the datatype-specific her2k_u_ker_var3 implementations:
// the diagonal offset and uplo of c, the m/n/k dimensions, a buffer plus
// panel stride for each of a, bh, b and ah, and a buffer plus row/column
// strides for c.
#define FUNCPTR_T her2k_fp
typedef void (*FUNCPTR_T)(
doff_t diagoffc,
uplo_t uploc,
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t ps_a,
void* bh, inc_t ps_bh,
void* b, inc_t ps_b,
void* ah, inc_t ps_ah,
void* c, inc_t rs_c, inc_t cs_c
);
// Table of implementations; bl2_her2k_u_ker_var3() indexes it with the
// execution datatype of c.
// NOTE(review): GENARRAY presumably expands to the array declarator plus an
// initializer listing the per-datatype instances -- confirm against its
// definition.
static FUNCPTR_T GENARRAY(ftypes,her2k_u_ker_var3);
//
// bl2_her2k_u_ker_var3()
//
// Object-based front end: pulls the dimensions, buffers and strides out of
// the operand objects and forwards them to the implementation selected by
// the execution datatype of c. alpha, alpha_conj, beta and cntl are accepted
// for interface uniformity but are not read here.
//
void bl2_her2k_u_ker_var3( obj_t*   alpha,
                           obj_t*   a,
                           obj_t*   bh,
                           obj_t*   alpha_conj,
                           obj_t*   b,
                           obj_t*   ah,
                           obj_t*   beta,
                           obj_t*   c,
                           her2k_t* cntl )
{
	// Properties of the output matrix.
	doff_t c_diagoff = bl2_obj_diag_offset( *c );
	uplo_t c_uplo    = bl2_obj_uplo( *c );
	dim_t  m_dim     = bl2_obj_length( *c );
	dim_t  n_dim     = bl2_obj_width( *c );
	void*  c_buf     = bl2_obj_buffer_at_off( *c );
	inc_t  c_rs      = bl2_obj_row_stride( *c );
	inc_t  c_cs      = bl2_obj_col_stride( *c );

	// Inner dimension comes from a.
	dim_t  k_dim     = bl2_obj_width( *a );

	// Buffer and panel stride of each input operand.
	void*  a_buf     = bl2_obj_buffer_at_off( *a );
	inc_t  a_ps      = bl2_obj_panel_stride( *a );
	void*  bh_buf    = bl2_obj_buffer_at_off( *bh );
	inc_t  bh_ps     = bl2_obj_panel_stride( *bh );
	void*  b_buf     = bl2_obj_buffer_at_off( *b );
	inc_t  b_ps      = bl2_obj_panel_stride( *b );
	void*  ah_buf    = bl2_obj_buffer_at_off( *ah );
	inc_t  ah_ps     = bl2_obj_panel_stride( *ah );

	// Implementation matching the execution datatype of c.
	FUNCPTR_T kernel = ftypes[ bl2_obj_execution_datatype( *c ) ];

	// Mixed-domain special case: a is complex while b is real. This is the
	// only combination in which the inner kernel tolerates differing
	// datatypes for a and b. Execution then happens in the real domain, so
	// the m dimension, the column stride of c and the panel stride of a are
	// each doubled.
	if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
	{
		m_dim = 2 * m_dim;
		c_cs  = 2 * c_cs;
		a_ps  = 2 * a_ps;
	}

	// Hand everything over to the typed implementation.
	kernel( c_diagoff,
	        c_uplo,
	        m_dim,
	        n_dim,
	        k_dim,
	        a_buf,  a_ps,
	        bh_buf, bh_ps,
	        b_buf,  b_ps,
	        ah_buf, ah_ps,
	        c_buf,  c_rs, c_cs );
}
//
// Datatype-generic body of her2k_u_ker_var3, instantiated once per datatype
// by INSERT_GENTFUNC_BASIC below.
//
// The generated function walks c in MR x NR tiles. For every tile it runs the
// micro-kernel twice -- once with a panel of a and the duplicated panel of bh,
// once with a panel of b and the duplicated panel of ah -- so that both
// products accumulate into the same tile. Tiles that intersect the diagonal
// (and the right-edge / bottom-right tiles) are computed into the temporary
// buffer ct and then added into c; tiles strictly above the diagonal are
// written to c directly; all other tiles are skipped.
//
// Parameters of the generated function:
//   diagoffc        diagonal offset of c.
//   uploc           uplo of c (not read by this implementation).
//   m, n, k         dimensions of the update.
//   a,  ps_a        packed row panels of a and the stride between panels.
//   bh, ps_bh       packed column panels of bh and the stride between panels.
//   b,  ps_b        packed row panels of b and the stride between panels.
//   ah, ps_ah       packed column panels of ah and the stride between panels.
//   c, rs_c, cs_c   output matrix and its row/column strides.
//
// Each panel pointer is advanced by the panel stride of its own operand.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
                           doff_t  diagoffc, \
                           uplo_t  uploc, \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   k, \
                           void*   a,  inc_t ps_a, \
                           void*   bh, inc_t ps_bh, \
                           void*   b,  inc_t ps_b, \
                           void*   ah, inc_t ps_ah, \
                           void*   c,  inc_t rs_c, inc_t cs_c \
                         ) \
{ \
	/* Temporary buffers holding the duplicated elements of the current
	   column panels of bh (bd) and ah (ad). */ \
	ctype           bd[ PASTEMAC2(ch,varname,_kc) * \
	                    PASTEMAC2(ch,varname,_nr) * \
	                    PASTEMAC2(ch,varname,_ndup) ]; \
	ctype           ad[ PASTEMAC2(ch,varname,_kc) * \
	                    PASTEMAC2(ch,varname,_nr) * \
	                    PASTEMAC2(ch,varname,_ndup) ]; \
\
	/* Temporary c buffer for diagonal and edge cases. */ \
	ctype           ct[ PASTEMAC2(ch,varname,_mr) * PASTEMAC2(ch,varname,_nr) ]; \
	const inc_t     rs_ct = 1; \
	const inc_t     cs_ct = PASTEMAC2(ch,varname,_mr); \
\
	/* Alias the m and n register blocksizes to shorter names. */ \
	const dim_t     MR = PASTEMAC2(ch,varname,_mr); \
	const dim_t     NR = PASTEMAC2(ch,varname,_nr); \
\
	ctype*          a_cast  = a; \
	ctype*          bh_cast = bh; \
	ctype*          b_cast  = b; \
	ctype*          ah_cast = ah; \
	ctype*          c_cast  = c; \
	ctype*          a1; \
	ctype*          bh1; \
	ctype*          b1; \
	ctype*          ah1; \
	ctype*          c1; \
	ctype*          c11; \
	inc_t           rstep_a, rstep_b; \
	inc_t           cstep_bh, cstep_ah; \
	inc_t           rstep_c, cstep_c; \
	doff_t          diagoffc_ij; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           i, j; \
\
	if ( bl2_zero_dim3( m, n, k ) ) return; \
\
	/*
	   Assumptions/assertions:
	     rs_a  == 1
	     cs_a  == GEMM_MR
	     ps_a  == stride to next row panel of A
	     ps_b  == stride to next row panel of B
	     rs_bh == GEMM_NR
	     cs_bh == 1
	     ps_bh == stride to next column panel of Bh
	     ps_ah == stride to next column panel of Ah
	     rs_c  == (no assumptions)
	     cs_c  == (no assumptions)
	*/ \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	/* Every panel pointer steps by the panel stride of its own operand. */ \
	rstep_a  = ps_a; \
	rstep_b  = ps_b; \
\
	cstep_bh = ps_bh; \
	cstep_ah = ps_ah; \
\
	rstep_c = MR * rs_c; \
	cstep_c = NR * cs_c; \
\
	bh1 = bh_cast; \
	ah1 = ah_cast; \
	c1  = c_cast; \
\
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		a1  = a_cast; \
		b1  = b_cast; \
		c11 = c1; \
\
		/* Copy the current iteration's NR columns of Bh and Ah to local
		   buffers with each value duplicated. */ \
		PASTEMAC2(ch,varname,_dupl)( k, bh1, bd ); \
		PASTEMAC2(ch,varname,_dupl)( k, ah1, ad ); \
\
		/* Interior loop. */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			/* Compute the diagonal offset for the submatrix at (i,j). */ \
			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
			/* If the diagonal intersects the current MR x NR submatrix, we
			   compute in the temporary buffer and then add in only the
			   stored part (via adds_mxn_u).
			   Otherwise, if the submatrix is strictly above the diagonal,
			   we compute and store as we normally would.
			   And if we're strictly below the diagonal, we do nothing and
			   continue. */ \
			if ( bl2_intersects_diag_n( diagoffc_ij, MR, NR ) ) \
			{ \
				/* Zero the temporary C buffer. */ \
				PASTEMAC(ch,set0s_mxn)( MR, NR, \
				                        ct, rs_ct, cs_ct ); \
\
				/* Invoke the micro-kernel for both products. */ \
				PASTEMAC2(ch,varname,_ukr)( k, \
				                            a1, \
				                            bd, \
				                            ct, rs_ct, cs_ct ); \
				PASTEMAC2(ch,varname,_ukr)( k, \
				                            b1, \
				                            ad, \
				                            ct, rs_ct, cs_ct ); \
\
				/* Add the result to only the stored part of C. */ \
				PASTEMAC2(ch,ch,adds_mxn_u)( diagoffc_ij, \
				                             MR, NR, \
				                             ct,  rs_ct, cs_ct, \
				                             c11, rs_c,  cs_c ); \
			} \
			else if ( bl2_is_strictly_above_diag_n( diagoffc_ij, MR, NR ) ) \
			{ \
				/* Invoke the micro-kernel for both products. */ \
				PASTEMAC2(ch,varname,_ukr)( k, \
				                            a1, \
				                            bd, \
				                            c11, rs_c, cs_c ); \
				PASTEMAC2(ch,varname,_ukr)( k, \
				                            b1, \
				                            ad, \
				                            c11, rs_c, cs_c ); \
			} \
\
			a1  += rstep_a; \
			b1  += rstep_b; \
			c11 += rstep_c; \
		} \
\
		/* Bottom edge handling. This case never occurs since the bottom
		   edge is never reached as part of the interior loop. (It is only
		   updated as part of the bottom-right corner handling below.) */ \
		if ( m_left ) \
		{ \
			; \
		} \
\
		bh1 += cstep_bh; \
		ah1 += cstep_ah; \
		c1  += cstep_c; \
	} \
\
	if ( n_left ) \
	{ \
		a1  = a_cast; \
		b1  = b_cast; \
		c11 = c1; \
\
		/* Copy the n_left (+ padding) columns of Bh and Ah to local
		   buffers with each value duplicated. */ \
		PASTEMAC2(ch,varname,_dupl)( k, bh1, bd ); \
		PASTEMAC2(ch,varname,_dupl)( k, ah1, ad ); \
\
		/* Right edge loop. (Note that the diagonal is guaranteed not
		   to factor in here.) */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			/* Zero the temporary C buffer. */ \
			PASTEMAC(ch,set0s_mxn)( MR, n_left, \
			                        ct, rs_ct, cs_ct ); \
\
			/* Invoke the micro-kernel for both products. */ \
			PASTEMAC2(ch,varname,_ukr)( k, \
			                            a1, \
			                            bd, \
			                            ct, rs_ct, cs_ct ); \
			PASTEMAC2(ch,varname,_ukr)( k, \
			                            b1, \
			                            ad, \
			                            ct, rs_ct, cs_ct ); \
\
			/* Add the result to the right edge of C. */ \
			PASTEMAC2(ch,ch,adds_mxn)( MR, n_left, \
			                           ct,  rs_ct, cs_ct, \
			                           c11, rs_c,  cs_c ); \
\
			a1  += rstep_a; \
			b1  += rstep_b; \
			c11 += rstep_c; \
		} \
\
		/* Compute the diagonal offset one last time. */ \
		diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
		/* Bottom-right corner handling. */ \
		if ( m_left ) \
		{ \
			/* Zero the temporary C buffer. */ \
			PASTEMAC(ch,set0s_mxn)( m_left, n_left, \
			                        ct, rs_ct, cs_ct ); \
\
			/* Invoke the micro-kernel for both products. */ \
			PASTEMAC2(ch,varname,_ukr)( k, \
			                            a1, \
			                            bd, \
			                            ct, rs_ct, cs_ct ); \
			PASTEMAC2(ch,varname,_ukr)( k, \
			                            b1, \
			                            ad, \
			                            ct, rs_ct, cs_ct ); \
\
			/* Add the result to only the stored part of C. */ \
			PASTEMAC2(ch,ch,adds_mxn_u)( diagoffc_ij, \
			                             m_left, n_left, \
			                             ct,  rs_ct, cs_ct, \
			                             c11, rs_c,  cs_c ); \
		} \
	} \
}
INSERT_GENTFUNC_BASIC( her2k, her2k_u_ker_var3 )

View File

@@ -1,103 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Default register blocksizes and micro-kernel shapes
//
// NOTE: These MR and NR values below MUST match the values that packm uses
// when initializing its control tree node.
//
#include "bl2_gemm_4x2.h"
#include "bl2_dupl_kx2.h"
// Per-datatype parameters consumed by the her2k_u_ker_var3 implementation.
// For each datatype prefix (s, d, c, z) the six names below give:
//   _ndup  duplication factor, _kc  k blocksize, _mr / _nr  register
//   blocksizes, _ukr  micro-kernel, _dupl  duplication routine.
// NOTE(review): the micro-kernels are fixed to the 4x2 gemm kernels while
// _mr/_nr come from BLIS_DEFAULT_MR_*/NR_*; presumably those defaults must be
// 4 and 2 for this to be consistent -- confirm.

// Single-precision real.
#define bl2_sher2k_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_S
#define bl2_sher2k_u_ker_var3_kc BLIS_DEFAULT_KC_S
#define bl2_sher2k_u_ker_var3_mr BLIS_DEFAULT_MR_S
#define bl2_sher2k_u_ker_var3_nr BLIS_DEFAULT_NR_S
#define bl2_sher2k_u_ker_var3_ukr bl2_sgemm_4x2
#define bl2_sher2k_u_ker_var3_dupl bl2_sdupl_kx2
// Double-precision real.
#define bl2_dher2k_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_D
#define bl2_dher2k_u_ker_var3_kc BLIS_DEFAULT_KC_D
#define bl2_dher2k_u_ker_var3_mr BLIS_DEFAULT_MR_D
#define bl2_dher2k_u_ker_var3_nr BLIS_DEFAULT_NR_D
#define bl2_dher2k_u_ker_var3_ukr bl2_dgemm_4x2
#define bl2_dher2k_u_ker_var3_dupl bl2_ddupl_kx2
// Single-precision complex.
#define bl2_cher2k_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_C
#define bl2_cher2k_u_ker_var3_kc BLIS_DEFAULT_KC_C
#define bl2_cher2k_u_ker_var3_mr BLIS_DEFAULT_MR_C
#define bl2_cher2k_u_ker_var3_nr BLIS_DEFAULT_NR_C
#define bl2_cher2k_u_ker_var3_ukr bl2_cgemm_4x2
#define bl2_cher2k_u_ker_var3_dupl bl2_cdupl_kx2
// Double-precision complex.
#define bl2_zher2k_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_Z
#define bl2_zher2k_u_ker_var3_kc BLIS_DEFAULT_KC_Z
#define bl2_zher2k_u_ker_var3_mr BLIS_DEFAULT_MR_Z
#define bl2_zher2k_u_ker_var3_nr BLIS_DEFAULT_NR_Z
#define bl2_zher2k_u_ker_var3_ukr bl2_zgemm_4x2
#define bl2_zher2k_u_ker_var3_dupl bl2_zdupl_kx2
// Object-based entry point for the her2k_u_ker_var3 kernel variant. Takes the
// two scalars (alpha, alpha_conj), the four input operands (a, bh, b, ah),
// beta, the output matrix c and a her2k control tree node.
void bl2_her2k_u_ker_var3( obj_t* alpha,
obj_t* a,
obj_t* bh,
obj_t* alpha_conj,
obj_t* b,
obj_t* ah,
obj_t* beta,
obj_t* c,
her2k_t* cntl );
// Prototype template for the datatype-specific implementations: diagonal
// offset and uplo of c, the m/n/k dimensions, a buffer plus panel stride for
// each of a, bh, b and ah, and a buffer plus row/column strides for c.
// NOTE(review): INSERT_GENTPROT_BASIC presumably instantiates GENTPROT once
// per basic datatype -- confirm against its definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffc, \
uplo_t uploc, \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t ps_a, \
void* bh, inc_t ps_bh, \
void* b, inc_t ps_b, \
void* ah, inc_t ps_ah, \
void* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( her2k_u_ker_var3 )

View File

@@ -1,133 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
//
// bl2_herk_l_blk_var1()
//
// Blocked herk variant that walks a and c in row blocks (bottom-to-top, via
// bl2_acquire_mpart_b2t). ah is packed once before the loop; every row block
// of c is then narrowed to its leading columns (up to and including the
// diagonal block), packed, updated by the herk sub-problem from the control
// tree, and unpacked again.
// NOTE(review): the "_l_" in the name presumably means that C is stored in
// its lower triangle; nothing in this function checks that.
//
// Parameters:
//   alpha  scalar handed to packm/herk for the product.
//   a      operand partitioned along m.
//   ah     operand packed whole with cntl_sub_packm_b().
//   beta   scalar applied to c (by scalm/packm, if instructed).
//   c      matrix being updated.
//   cntl   herk control tree node supplying the sub-operations.
//
void bl2_herk_l_blk_var1( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  ah,
                          obj_t*  beta,
                          obj_t*  c,
                          herk_t* cntl )
{
	// Current row block of A and its packed form.
	obj_t a_blk, a_blk_packed;

	// Packed A' and the leading columns of it used by the current block.
	obj_t ah_packed, ah_lead_packed;

	// Current row block of C, its leading columns, and their packed form.
	obj_t c_blk, c_lead, c_lead_packed;

	const dim_t lead_off = 0;  // the leading region always starts at column 0
	dim_t       lead_width;
	dim_t       blk_size;
	dim_t       row;
	dim_t       m_total;

	// Every pack object that reaches packm_init() must be initialized first.
	bl2_obj_init_pack( &a_blk_packed );
	bl2_obj_init_pack( &ah_packed );
	bl2_obj_init_pack( &c_lead_packed );

	// Extent of C in the direction we partition along.
	m_total = bl2_obj_length_after_trans( *c );

	// Apply beta to C up front (if the control tree asks for it).
	bl2_scalm_int( beta, c, cntl_sub_scalm( cntl ) );

	// Set up and perform the packing of A', scaling by alpha (if instructed).
	bl2_packm_init( ah, &ah_packed, cntl_sub_packm_b( cntl ) );
	bl2_packm_int( alpha, ah, &ah_packed, cntl_sub_packm_b( cntl ) );

	// Walk the m dimension one algorithmic block at a time.
	row = 0;
	while ( row < m_total )
	{
		// Size of the block handled by this pass.
		blk_size = bl2_determine_blocksize_b( row, m_total, a,
		                                      cntl_blocksize( cntl ) );

		// Row blocks of A and C.
		bl2_acquire_mpart_b2t( BLIS_SUBPART1, row, blk_size, a, &a_blk );
		bl2_acquire_mpart_b2t( BLIS_SUBPART1, row, blk_size, c, &c_blk );

		// Width of the stored part of the C block: everything up to and
		// including the diagonal block, capped at the block's width.
		lead_width = bl2_min( bl2_obj_width_after_trans( c_blk ),
		                      bl2_obj_diag_offset_after_trans( c_blk ) + blk_size );

		// Leading columns of the C block and of the packed A'.
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       lead_off, lead_width, &c_blk, &c_lead );
		bl2_acquire_mpart_l2r( BLIS_SUBPART1,
		                       lead_off, lead_width, &ah_packed, &ah_lead_packed );

		// Prepare the pack objects for this block of A and of C.
		bl2_packm_init( &a_blk, &a_blk_packed, cntl_sub_packm_a( cntl ) );
		bl2_packm_init( &c_lead, &c_lead_packed, cntl_sub_packm_c( cntl ) );

		// Pack the A block (scaled by alpha, if instructed) ...
		bl2_packm_int( alpha, &a_blk, &a_blk_packed,
		               cntl_sub_packm_a( cntl ) );

		// ... and the C block (scaled by beta, if instructed).
		bl2_packm_int( beta, &c_lead, &c_lead_packed,
		               cntl_sub_packm_c( cntl ) );

		// Run the herk sub-problem on the packed operands.
		bl2_herk_int( alpha,
		              &a_blk_packed,
		              &ah_lead_packed,
		              beta,
		              &c_lead_packed,
		              cntl_sub_herk( cntl ) );

		// Copy the result back into C (if C was packed).
		bl2_unpackm_int( &c_lead_packed, &c_lead,
		                 cntl_sub_unpackm_c( cntl ) );

		row += blk_size;
	}

	// Hand any packing buffers obtained inside packm back to the memory
	// manager.
	bl2_obj_release_pack( &a_blk_packed );
	bl2_obj_release_pack( &ah_packed );
	bl2_obj_release_pack( &c_lead_packed );
}

View File

@@ -1,160 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
extern gemm_t* gemm_cntl_bp_ke;
/*
   bl2_herk_l_blk_var1()

   Blocked herk variant that walks A and C in row panels, acquiring the
   panels with the bottom-to-top partitioning routines. A' is packed once
   before the loop. Each row panel C1 is split column-wise into two
   pieces:

     - a left piece of width min( width(C1), diagoff(C1) ), updated by a
       gemm sub-problem that uses the external gemm_cntl_bp_ke control
       tree, and
     - a piece of width b_alg that starts where the left piece ends,
       updated by the herk sub-problem from cntl.

   alpha, beta - scalars; applied where the control tree instructs
   a           - the A operand, partitioned into row panels
   ah          - the operand referred to as A' in this routine
   c           - the matrix being updated
   cntl        - control tree node supplying the blocksize and the
                 scalm, packm, herk and unpackm sub-operations
*/
void bl2_herk_l_blk_var1( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  ah,
                          obj_t*  beta,
                          obj_t*  c,
                          herk_t* cntl )
{
    obj_t a_panel, a_panel_packed;
    obj_t ah_packed, ah_left_packed, ah_mid_packed;
    obj_t c_panel, c_mid;
    obj_t c_left, c_left_packed, c_mid_packed;

    dim_t pos;
    dim_t blk;
    dim_t m_total;
    dim_t left_off, left_width;

    // Every object that is later handed to packm_init() must first be
    // put into a known "nothing packed yet" state.
    bl2_obj_init_pack( &a_panel_packed );
    bl2_obj_init_pack( &ah_packed );
    bl2_obj_init_pack( &c_left_packed );
    bl2_obj_init_pack( &c_mid_packed );

    // Extent of C along the direction we partition in.
    m_total = bl2_obj_length_after_trans( *c );

    // Apply beta to C here if the control tree asks for it.
    bl2_scalm_int( beta,
                   c,
                   cntl_sub_scalm( cntl ) );

    // Set up the pack object for A' ...
    bl2_packm_init( ah, &ah_packed,
                    cntl_sub_packm_b( cntl ) );

    // ... and pack it (scaling by alpha if the control tree says so).
    bl2_packm_int( alpha,
                   ah, &ah_packed,
                   cntl_sub_packm_b( cntl ) );

    // March over the m dimension one algorithmic block at a time.
    pos = 0;
    while ( pos < m_total )
    {
        // Size of the block handled by this iteration.
        blk = bl2_determine_blocksize_b( pos, m_total, a,
                                         cntl_blocksize( cntl ) );

        // Row panels A1 and C1 for the current block.
        bl2_acquire_mpart_b2t( BLIS_SUBPART1,
                               pos, blk, a, &a_panel );
        bl2_acquire_mpart_b2t( BLIS_SUBPART1,
                               pos, blk, c, &c_panel );

        // Left piece: columns [0, left_width) of C1 and of packed A',
        // where left_width is the diagonal offset of C1 capped at the
        // width of C1.
        left_off   = 0;
        left_width = bl2_min( bl2_obj_width_after_trans( c_panel ),
                              bl2_obj_diag_offset_after_trans( c_panel ) );

        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               left_off, left_width, &c_panel, &c_left );
        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               left_off, left_width, &ah_packed, &ah_left_packed );

        // Second piece: blk columns beginning at column left_width.
        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               left_width, blk, &c_panel, &c_mid );
        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               left_width, blk, &ah_packed, &ah_mid_packed );

        // Prepare the pack objects for A1 and both pieces of C1.
        bl2_packm_init( &a_panel, &a_panel_packed,
                        cntl_sub_packm_a( cntl ) );
        bl2_packm_init( &c_left, &c_left_packed,
                        cntl_sub_packm_c( cntl ) );
        bl2_packm_init( &c_mid, &c_mid_packed,
                        cntl_sub_packm_c( cntl ) );

        // Pack A1 (alpha applied if instructed).
        bl2_packm_int( alpha,
                       &a_panel, &a_panel_packed,
                       cntl_sub_packm_a( cntl ) );

        // Pack the left piece of C1 (beta applied if instructed).
        bl2_packm_int( beta,
                       &c_left, &c_left_packed,
                       cntl_sub_packm_c( cntl ) );

        // Pack the second piece of C1 (beta applied if instructed).
        bl2_packm_int( beta,
                       &c_mid, &c_mid_packed,
                       cntl_sub_packm_c( cntl ) );

        // gemm sub-problem for the left piece.
        bl2_gemm_int( alpha,
                      &a_panel_packed,
                      &ah_left_packed,
                      beta,
                      &c_left_packed,
                      gemm_cntl_bp_ke );

        // herk sub-problem for the second piece.
        bl2_herk_int( alpha,
                      &a_panel_packed,
                      &ah_mid_packed,
                      beta,
                      &c_mid_packed,
                      cntl_sub_herk( cntl ) );

        // Write the left piece back into C1 if it was packed.
        bl2_unpackm_int( &c_left_packed, &c_left,
                         cntl_sub_unpackm_c( cntl ) );

        // Write the second piece back into C1 if it was packed.
        bl2_unpackm_int( &c_mid_packed, &c_mid,
                         cntl_sub_unpackm_c( cntl ) );

        pos += blk;
    }

    // Hand any buffers obtained by packm back to the memory manager.
    bl2_obj_release_pack( &a_panel_packed );
    bl2_obj_release_pack( &ah_packed );
    bl2_obj_release_pack( &c_left_packed );
    bl2_obj_release_pack( &c_mid_packed );
}

View File

@@ -1,143 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
/*
   bl2_herk_l_blk_var1()

   Blocked herk variant that walks A and C in row panels, acquiring the
   panels with the top-to-bottom partitioning routines. Unlike a variant
   that packs all of A' before the loop, this one only initializes the
   pack object for A' up front and then packs A' incrementally: each
   iteration packs the column range [i, i+b_alg) of A' into the matching
   range of the pack object before the herk sub-problem is run.

   alpha, beta - scalars; applied where the control tree instructs
   a           - the A operand, partitioned into row panels
   ah          - the operand referred to as A' in this routine
   c           - the matrix being updated
   cntl        - control tree node supplying the blocksize and all
                 sub-operations (scalm, packm, herk, unpackm)
*/
void bl2_herk_l_blk_var1( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  ah,
                          obj_t*  beta,
                          obj_t*  c,
                          herk_t* cntl )
{
    obj_t a_panel, a_panel_packed;
    obj_t ah_packed, ah_left_packed;
    obj_t c_panel;
    obj_t c_left, c_left_packed;

    dim_t pos;
    dim_t blk;
    dim_t m_total;
    dim_t left_off, left_width;

    // Every object that is later handed to packm_init() must first be
    // put into a known "nothing packed yet" state.
    bl2_obj_init_pack( &a_panel_packed );
    bl2_obj_init_pack( &ah_packed );
    bl2_obj_init_pack( &c_left_packed );

    // Extent of C along the direction we partition in.
    m_total = bl2_obj_length_after_trans( *c );

    // Apply beta to C here if the control tree asks for it.
    bl2_scalm_int( beta,
                   c,
                   cntl_sub_scalm( cntl ) );

    // Set up the pack object for A'. The packing itself is deliberately
    // not done here; it happens slice by slice inside the loop.
    bl2_packm_init( ah, &ah_packed,
                    cntl_sub_packm_b( cntl ) );

    // March over the m dimension one algorithmic block at a time.
    pos = 0;
    while ( pos < m_total )
    {
        obj_t ah_slice, ah_packed_slice;

        // Size of the block handled by this iteration.
        blk = bl2_determine_blocksize_f( pos, m_total, a,
                                         cntl_blocksize( cntl ) );

        // Row panels A1 and C1 for the current block.
        bl2_acquire_mpart_t2b( BLIS_SUBPART1,
                               pos, blk, a, &a_panel );
        bl2_acquire_mpart_t2b( BLIS_SUBPART1,
                               pos, blk, c, &c_panel );

        // Only the leading columns of C1 are stored. Their count follows
        // from the diagonal offset of C1 plus the block size, capped at
        // the width of C1. The same column range is taken from packed A'.
        left_off   = 0;
        left_width = bl2_min( bl2_obj_width_after_trans( c_panel ),
                              bl2_obj_diag_offset_after_trans( c_panel ) + blk );

        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               left_off, left_width, &c_panel, &c_left );
        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               left_off, left_width, &ah_packed, &ah_left_packed );

        // Pack the next slice of A' (columns pos .. pos+blk) into the
        // corresponding slice of the pack object.
        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               pos, blk, ah, &ah_slice );
        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               pos, blk, &ah_packed, &ah_packed_slice );
        bl2_packm_int( alpha,
                       &ah_slice, &ah_packed_slice,
                       cntl_sub_packm_b( cntl ) );

        // Prepare the pack objects for A1 and the stored part of C1.
        bl2_packm_init( &a_panel, &a_panel_packed,
                        cntl_sub_packm_a( cntl ) );
        bl2_packm_init( &c_left, &c_left_packed,
                        cntl_sub_packm_c( cntl ) );

        // Pack A1 (alpha applied if instructed).
        bl2_packm_int( alpha,
                       &a_panel, &a_panel_packed,
                       cntl_sub_packm_a( cntl ) );

        // Pack the stored part of C1 (beta applied if instructed).
        bl2_packm_int( beta,
                       &c_left, &c_left_packed,
                       cntl_sub_packm_c( cntl ) );

        // herk sub-problem on the packed operands.
        bl2_herk_int( alpha,
                      &a_panel_packed,
                      &ah_left_packed,
                      beta,
                      &c_left_packed,
                      cntl_sub_herk( cntl ) );

        // Write the result back into C1 if it was packed.
        bl2_unpackm_int( &c_left_packed, &c_left,
                         cntl_sub_unpackm_c( cntl ) );

        pos += blk;
    }

    // Hand any buffers obtained by packm back to the memory manager.
    bl2_obj_release_pack( &a_panel_packed );
    bl2_obj_release_pack( &ah_packed );
    bl2_obj_release_pack( &c_left_packed );
}

View File

@@ -1,465 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
// Pointer type for the datatype-specific implementations of this variant.
// The parameter list matches the functions generated by the GENTFUNC
// template later in this file: the diagonal offset of A, the m/n/k
// dimensions, each of A and B with row, column and panel strides, a pointer
// to beta, and C with its row and column strides.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
doff_t diagoffa,
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c
);
// Dispatch table; bl2_trmm_l_ker_var2() indexes it with the execution
// datatype. NOTE(review): GENARRAY presumably fills it with one
// trmm_l_ker_var2 instance per datatype -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_l_ker_var2);
/*
   bl2_trmm_l_ker_var2()

   Object-level front end. It extracts dimensions, buffers and strides
   from the a, b and c objects, resolves the buffer behind beta, and
   forwards everything to the datatype-specific implementation selected
   by the execution datatype of c.

   alpha - not referenced by this function
   a     - supplies the diagonal offset, k (its width), buffer, and
           row/column/panel strides
   b     - supplies buffer and row/column/panel strides
   beta  - scalar object; its buffer is passed to the typed function
   c     - supplies m and n, buffer, and row/column strides
   cntl  - not referenced by this function
*/
void bl2_trmm_l_ker_var2( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  b,
                          obj_t*  beta,
                          obj_t*  c,
                          trmm_t* cntl )
{
    num_t     exec_dt   = bl2_obj_execution_datatype( *c );

    doff_t    a_diagoff = bl2_obj_diag_offset( *a );

    dim_t     m_dim     = bl2_obj_length( *c );
    dim_t     n_dim     = bl2_obj_width( *c );
    dim_t     k_dim     = bl2_obj_width( *a );

    void*     a_ptr     = bl2_obj_buffer_at_off( *a );
    inc_t     a_rs      = bl2_obj_row_stride( *a );
    inc_t     a_cs      = bl2_obj_col_stride( *a );
    inc_t     a_ps      = bl2_obj_panel_stride( *a );

    void*     b_ptr     = bl2_obj_buffer_at_off( *b );
    inc_t     b_rs      = bl2_obj_row_stride( *b );
    inc_t     b_cs      = bl2_obj_col_stride( *b );
    inc_t     b_ps      = bl2_obj_panel_stride( *b );

    void*     c_ptr     = bl2_obj_buffer_at_off( *c );
    inc_t     c_rs      = bl2_obj_row_stride( *c );
    inc_t     c_cs      = bl2_obj_col_stride( *c );

    num_t     beta_dt;
    void*     beta_ptr;

    FUNCPTR_T typed_fn;

    /*
       Disabled: special handling of the one mixed-datatype case the inner
       kernel permits (c and a complex, b real). With a real execution
       datatype, the m dimension, the column stride of c, the column
       stride (panel length) of a, and the panel stride of a would each
       be doubled:

       if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
       {
           m    *= 2;
           cs_c *= 2;
           cs_a *= 2;
           ps_a *= 2;
       }
    */

    // Resolve beta to a raw buffer. For a scalar constant the address of
    // the constant matching exec_dt is used; otherwise the datatype stored
    // in the beta object selects the buffer at beta's offset. beta_dt is
    // filled in as well but is not consulted afterwards.
    bl2_set_scalar_dt_buffer( beta, exec_dt, beta_dt, beta_ptr );

    // Pick the implementation for the execution datatype ...
    typed_fn = ftypes[exec_dt];

    // ... and run it.
    typed_fn( a_diagoff,
              m_dim,
              n_dim,
              k_dim,
              a_ptr, a_rs, a_cs, a_ps,
              b_ptr, b_rs, b_cs, b_ps,
              beta_ptr,
              c_ptr, c_rs, c_cs );
}
/*
Typed implementation template for trmm_l_ker_var2.
For each NR-wide column panel of B (copied, with duplication, into the
local buffer bd) the generated function walks the MR-tall row panels of A
and invokes the micro-kernel on the corresponding MR x NR tile of C.
- A row panel that intersects the diagonal of A is multiplied using a
shortened k dimension (k_a1011) and the caller-supplied beta.
- A row panel strictly below the diagonal is multiplied using the full k
and a beta of one, i.e. the product is accumulated into C.
- A row panel for which neither test holds is skipped.
Partial tiles along the bottom and right edges of C are staged through the
local MR x NR buffer ct.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
PASTEMAC2(ch,varname,_nr) * \
PASTEMAC2(ch,varname,_ndup) ]; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
PASTEMAC2(ch,varname,_nr) ]; \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
\
/* Alias some constants to shorter names. */ \
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict a1; \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict bd_i; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t k_ndup; \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == GEMM_MR
ps_a == stride to next row panel of A
rs_b == GEMM_NR
cs_b == 1
ps_b == stride to next column panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Nothing to do if any of the three dimensions is zero. */ \
if ( bl2_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bl2_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* If the diagonal offset is negative, adjust the pointer to C and
treat this case as if the diagonal offset were zero. Note that
we don't need to adjust the pointer to A since packm would have
simply skipped over the panels that were not stored. */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Number of rows of B that are duplicated into bd, scaled by NDUP.
Note that k_a1011 is recomputed per row panel further below. */ \
k_a1011 = bl2_min( k, diagoffa + m ); \
k_ndup = k_a1011 * NDUP; \
\
/* Step between row panels of A that span the full k dimension. */ \
rstep_a = k * MR; \
\
cstep_b = ps_b; \
\
rstep_c = MR * rs_c; \
cstep_c = NR * cs_c; \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the full NR-wide column panels of B and C. */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
a1 = a_cast; \
c11 = c1; \
\
/* Copy the current iteration's NR columns of B to a local buffer
with each value duplicated. */ \
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
\
/* Interior loop. */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = 0; \
k_a1011 = bl2_min( k, diagoffa_i + MR ); \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
one, \
a1, \
bd_i, \
beta, \
c11, rs_c, cs_c ); \
\
/* Advance past the k_a1011 columns of this panel just used. */ \
a1 += k_a1011 * MR; \
} \
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k, \
one, \
a1, \
bd, \
one, \
c11, rs_c, cs_c ); \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
\
/* Bottom edge handling. */ \
if ( m_left ) \
{ \
/* i equals m_iter here, i.e. it indexes the partial row panel. */ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
if ( bl2_intersects_diag_n( diagoffa_i, m_left, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = 0; \
k_a1011 = bl2_min( k, diagoffa_i + m_left ); \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
one, \
a1, \
bd_i, \
beta, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, m_left, k ) ) \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k, \
one, \
a1, \
bd, \
one, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/* Handle the final, partial (n_left-wide) column panel, if any. */ \
if ( n_left ) \
{ \
a1 = a_cast; \
c11 = c1; \
\
/* Copy the n_left (+ padding) columns of B to a local buffer
with each value duplicated. */ \
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
\
/* Right edge loop. */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = 0; \
k_a1011 = bl2_min( k, diagoffa_i + MR ); \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
one, \
a1, \
bd_i, \
beta, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the right edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
\
/* Advance past the k_a1011 columns of this panel just used. */ \
a1 += k_a1011 * MR; \
} \
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k, \
one, \
a1, \
bd, \
one, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the right edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
\
/* Bottom-right corner handling. */ \
if ( m_left ) \
{ \
/* i equals m_iter here, i.e. it indexes the partial row panel. */ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
if ( bl2_intersects_diag_n( diagoffa_i, m_left, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = 0; \
k_a1011 = bl2_min( k, diagoffa_i + m_left ); \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
one, \
a1, \
bd_i, \
beta, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom-right corner of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, m_left, k ) ) \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k, \
one, \
a1, \
bd, \
one, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom-right corner of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC( trmm, trmm_l_ker_var2 )

View File

@@ -1,218 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
/* Bottom edge handling. */ \
if ( m_left ) \
{ \
/* Use the diagonal offset for the current panel of A to compute
k_use <= k so that we minimize the number of flops with zeros
(ie: when the current panel intersects the diagonal). */ \
diagoffa_i = diagoffa + (doff_t)i*MR; \
k_diag = diagoffa_i + MR; \
if ( k_diag < 0 ) k_use = 0; \
else if ( k_diag > k ) k_use = k; \
else k_use = k_diag; \
\
/* If the current panel intersects the diagonal, we need to
scale by beta. (When the current function is invoked as
part of classic trmm, beta will be zero, and when invoked as
part of trmm3, beta will be non-zero). If the current panel
does not intersect the diagonal (but still has non-zero
elements), we accumulate into C (for both trmm and trmm3). */ \
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_use, \
a1, \
bd, \
beta, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
else if ( k_use != 0 ) \
{ \
/* Invoke the micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_use, \
a1, \
bd, \
zero, \
ct, rs_ct, cs_ct ); \
\
/* Add the result to the bottom edge of C. */ \
PASTEMAC2(ch,ch,adds_mxn)( m_left, NR, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
if ( n_left ) \
{ \
a1 = a_cast; \
c11 = c1; \
\
/* Copy the n_left (+ padding) columns of B to a local buffer
with each value duplicated. */ \
PASTEMAC2(ch,varname,_dupl)( k, b1, bd ); \
\
/* Right edge loop. */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Use the diagonal offset for the current panel of A to compute
k_use <= k so that we minimize the number of flops with zeros
(ie: when the current panel intersects the diagonal). */ \
diagoffa_i = diagoffa + (doff_t)i*MR; \
k_diag = diagoffa_i + MR; \
if ( k_diag < 0 ) k_use = 0; \
else if ( k_diag > k ) k_use = k; \
else k_use = k_diag; \
\
/* If the current panel intersects the diagonal, we need to
scale by beta. (When the current function is invoked as
part of classic trmm, beta will be zero, and when invoked as
part of trmm3, beta will be non-zero). If the current panel
does not intersect the diagonal (but still has non-zero
elements), we accumulate into C (for both trmm and trmm3). */ \
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_use, \
a1, \
bd, \
beta, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the right edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
else if ( k_use != 0 ) \
{ \
/* Invoke the micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_use, \
a1, \
bd, \
zero, \
ct, rs_ct, cs_ct ); \
\
/* Add the result to the right edge of C. */ \
PASTEMAC2(ch,ch,adds_mxn)( MR, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
/* Bottom-right corner handling. */ \
if ( m_left ) \
{ \
/* Use the diagonal offset for the current panel of A to compute
k_use <= k so that we minimize the number of flops with zeros
(ie: when the current panel intersects the diagonal). */ \
diagoffa_i = diagoffa + (doff_t)i*MR; \
k_diag = diagoffa_i + MR; \
if ( k_diag < 0 ) k_use = 0; \
else if ( k_diag > k ) k_use = k; \
else k_use = k_diag; \
\
/* If the current panel intersects the diagonal, we need to
scale by beta. (When the current function is invoked as
part of classic trmm, beta will be zero, and when invoked as
part of trmm3, beta will be non-zero). If the current panel
does not intersect the diagonal (but still has non-zero
elements), we accumulate into C (for both trmm and trmm3). */ \
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_use, \
a1, \
bd, \
beta, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom-right corner of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
else if ( k_use != 0 ) \
{ \
/* Invoke the micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_use, \
a1, \
bd, \
zero, \
ct, rs_ct, cs_ct ); \
\
/* Add the result to the bottom-right corner of C. */ \
PASTEMAC2(ch,ch,adds_mxn)( m_left, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC( trmm, trmm_l_ker_var2 )

View File

@@ -1,33 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

View File

@@ -1,126 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
/*
   bl2_trmm_ll_blk_var1()

   Blocked trmm variant that partitions A along its diagonal using the
   bottom-right-to-top-left partitioning routine, and B in matching row
   panels using the bottom-to-top routine. Each iteration performs

     B1 = tril( A11 ) * B1;
     B1 = B1 + A10 * B0;

   where the first update runs on packed copies of A11 and B1 and the
   second accumulates into the packed B1 before it is unpacked.

   alpha - scalar; applied where the control tree instructs
   a     - the A operand, treated as square
   b     - the matrix being updated
   cntl  - control tree node supplying the blocksize and the scalm,
           packm, trmm, gemm and unpackm sub-operations
*/
void bl2_trmm_ll_blk_var1( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    obj_t a_diag, a_diag_packed;
    obj_t a_left;
    obj_t b_cur, b_cur_packed;
    obj_t b_above;

    dim_t pos;
    dim_t blk;
    dim_t dim;

    // Put the pack objects into a known "nothing packed yet" state.
    bl2_obj_init_pack( &a_diag_packed );
    bl2_obj_init_pack( &b_cur_packed );

    // A is expected to be square, so its length can be used regardless
    // of any transposition recorded in the object.
    dim = bl2_obj_length( *a );

    // Apply alpha to B here if the control tree asks for it.
    bl2_scalm_int( alpha,
                   b,
                   cntl_sub_scalm( cntl ) );

    // Walk along the diagonal of A one algorithmic block at a time.
    pos = 0;
    while ( pos < dim )
    {
        // Size of the block handled by this iteration. Passing a here
        // (for the execution datatype) is deliberate: it selects the
        // right blocksize when c and a are complex and b is real.
        blk = bl2_determine_blocksize_b( pos, dim,
                                         a,
                                         cntl_blocksize( cntl ) );

        // Diagonal block A11 and the block A10 next to it.
        bl2_acquire_mpart_br2tl( BLIS_SUBPART11,
                                 pos, blk, a, &a_diag );
        bl2_acquire_mpart_br2tl( BLIS_SUBPART10,
                                 pos, blk, a, &a_left );

        // Row panel B1 and the remaining part B0.
        bl2_acquire_mpart_b2t( BLIS_SUBPART1,
                               pos, blk, b, &b_cur );
        bl2_acquire_mpart_b2t( BLIS_SUBPART0,
                               pos, blk, b, &b_above );

        // Copy/pack A11 and scale it by alpha, each only if instructed.
        bl2_packm_int( alpha,
                       &a_diag,
                       &a_diag_packed,
                       cntl_sub_packm_a( cntl ) );

        // Copy/pack B1 and scale it by alpha, each only if instructed.
        bl2_packm_int( alpha,
                       &b_cur,
                       &b_cur_packed,
                       cntl_sub_packm_b( cntl ) );

        // B1 = tril( A11 ) * B1;
        bl2_trmm_int( BLIS_LEFT,
                      &BLIS_ONE,
                      &a_diag_packed,
                      &b_cur_packed,
                      cntl_sub_trmm( cntl ) );

        // B1 = B1 + A10 * B0;
        bl2_gemm_int( &BLIS_ONE,
                      &a_left,
                      &b_above,
                      &BLIS_ONE,
                      &b_cur_packed,
                      cntl_sub_gemm( cntl ) );

        // Copy/unpack the result back into B1 if B1 was packed.
        bl2_unpackm_int( &b_cur_packed,
                         &b_cur,
                         cntl_sub_unpackm_b( cntl ) );

        pos += blk;
    }

    // Hand any buffers obtained by packm back to the memory manager.
    bl2_obj_release_pack( &a_diag_packed );
    bl2_obj_release_pack( &b_cur_packed );
}

View File

@@ -1,39 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Blocked variant 1 of left-side, lower-triangular trmm (object-based
// interface).
//   alpha : scalar operand.
//   a     : triangular matrix operand.
//   b     : matrix operand; its row panels are overwritten with the result.
//   cntl  : control structure supplying the blocksize and the sub-operations.
void bl2_trmm_ll_blk_var1( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );

View File

@@ -1,126 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Blocked variant 2 of left-side, lower-triangular trmm.
//
// The diagonal of a is traversed in blocks using the br2tl (for a) and b2t
// (for b) partitioning helpers. At each step, with A11 the current diagonal
// block, A21 the block beneath it, and B1/B2 the matching row panels of b:
//   1. A11 and B1 are handed to packm, which copies/packs them and applies
//      alpha only if cntl instructs it to;
//   2. B2 := B2 + A21 * B1       (gemm, reading the packed B1);
//   3. B1 := tril( A11 ) * B1    (trmm, on the packed operands);
//   4. the packed B1 is unpacked back into B1 (if it was packed).
// Step 2 precedes step 3 because step 3 overwrites the packed B1 that
// step 2 reads.
//
//   alpha : scalar; forwarded to scalm and packm, while the gemm and trmm
//           sub-problems are invoked with BLIS_ONE.
//   a     : triangular matrix (expected to be square).
//   b     : matrix updated in place.
//   cntl  : supplies the blocksize and the scalm, packm, gemm, trmm and
//           unpackm sub-operations.
void bl2_trmm_ll_blk_var2( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    obj_t a_diag, a_diag_packed;   // A11 and its packed counterpart
    obj_t a_below;                 // A21
    obj_t b_cur, b_cur_packed;     // B1 and its packed counterpart
    obj_t b_below;                 // B2
    dim_t offset;
    dim_t blksz;
    dim_t dim_a;

    // Both pack objects must be initialized before packm sees them.
    bl2_obj_init_pack( &a_diag_packed );
    bl2_obj_init_pack( &b_cur_packed );

    // A should be square, so its length is the extent of the diagonal no
    // matter what transposition is recorded in the object.
    dim_a = bl2_obj_length( *a );

    // Let the scalm sub-operation apply alpha to all of B (if instructed).
    bl2_scalm_int( alpha,
                   b,
                   cntl_sub_scalm( cntl ) );

    // Walk along the diagonal, one block per pass.
    offset = 0;
    while ( offset < dim_a )
    {
        // Blocksize for this pass. The object a (not b) is passed on
        // purpose so that the execution datatype selects the blocksize;
        // this gives the right value when a is complex and b is real.
        blksz = bl2_determine_blocksize_b( offset, dim_a,
                                           a,
                                           cntl_blocksize( cntl ) );

        // Views of A11 and A21.
        bl2_acquire_mpart_br2tl( BLIS_SUBPART11,
                                 offset, blksz, a, &a_diag );
        bl2_acquire_mpart_br2tl( BLIS_SUBPART21,
                                 offset, blksz, a, &a_below );

        // Views of B1 and B2.
        bl2_acquire_mpart_b2t( BLIS_SUBPART1,
                               offset, blksz, b, &b_cur );
        bl2_acquire_mpart_b2t( BLIS_SUBPART2,
                               offset, blksz, b, &b_below );

        // Copy/pack A11, scaling by alpha (each only if instructed).
        bl2_packm_int( alpha,
                       &a_diag,
                       &a_diag_packed,
                       cntl_sub_packm_a( cntl ) );

        // Copy/pack B1, scaling by alpha (each only if instructed).
        bl2_packm_int( alpha,
                       &b_cur,
                       &b_cur_packed,
                       cntl_sub_packm_b( cntl ) );

        // B2 = B2 + A21 * B1;
        bl2_gemm_int( &BLIS_ONE,
                      &a_below,
                      &b_cur_packed,
                      &BLIS_ONE,
                      &b_below,
                      cntl_sub_gemm( cntl ) );

        // B1 = tril( A11 ) * B1;
        bl2_trmm_int( BLIS_LEFT,
                      &BLIS_ONE,
                      &a_diag_packed,
                      &b_cur_packed,
                      cntl_sub_trmm( cntl ) );

        // Write the packed B1 back into B1 (if B1 was packed).
        bl2_unpackm_int( &b_cur_packed,
                         &b_cur,
                         cntl_sub_unpackm_b( cntl ) );

        offset += blksz;
    }

    // Hand any buffers that packm acquired back to the memory manager.
    bl2_obj_release_pack( &a_diag_packed );
    bl2_obj_release_pack( &b_cur_packed );
}

View File

@@ -1,39 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Blocked variant 2 of left-side, lower-triangular trmm (object-based
// interface).
//   alpha : scalar operand.
//   a     : triangular matrix operand (expected to be square).
//   b     : matrix operand, updated in place.
//   cntl  : control structure supplying the blocksize and the sub-operations.
void bl2_trmm_ll_blk_var2( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );

View File

@@ -1,107 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Blocked variant 3 of left-side, lower-triangular trmm.
//
// b is partitioned along its width (after any transposition) into column
// panels B1, taken left to right. For every panel the whole of a and the
// panel B1 are handed to packm, B1 := tril( A ) * B1 is computed by the
// trmm sub-operation on the packed operands, and the packed B1 is unpacked
// back into B1.
//
//   alpha : scalar; forwarded to scalm, to packm and to the trmm
//           sub-problem.
//   a     : triangular matrix.
//   b     : matrix updated in place.
//   cntl  : supplies the blocksize and the scalm, packm, trmm and unpackm
//           sub-operations.
//
// NOTE(review): unlike the other blocked variants in this directory, which
// pass BLIS_ONE to the trmm sub-problem, this one passes alpha in addition
// to offering it to scalm and packm. Whether alpha can be applied more than
// once depends on what cntl instructs -- confirm against the control
// structures actually used with this variant.
void bl2_trmm_ll_blk_var3( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    obj_t a_packed;
    obj_t b_panel, b_panel_packed;
    dim_t col;
    dim_t blksz;
    dim_t width;

    // Both pack objects must be initialized before packm sees them.
    bl2_obj_init_pack( &a_packed );
    bl2_obj_init_pack( &b_panel_packed );

    // Extent of B in the direction being partitioned.
    width = bl2_obj_width_after_trans( *b );

    // Let the scalm sub-operation apply alpha to all of B (if instructed).
    bl2_scalm_int( alpha,
                   b,
                   cntl_sub_scalm( cntl ) );

    // Sweep over the n dimension, one column panel per pass.
    col = 0;
    while ( col < width )
    {
        // Blocksize for this pass. The object a (not b) is passed on
        // purpose so that the execution datatype selects the blocksize;
        // this gives the right value when a is complex and b is real.
        blksz = bl2_determine_blocksize_f( col, width,
                                           a,
                                           cntl_blocksize( cntl ) );

        // View of the current column panel B1.
        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               col, blksz, b, &b_panel );

        // Copy/pack A, scaling by alpha (each only if instructed). This
        // call is issued on every pass of the loop.
        bl2_packm_int( alpha,
                       a,
                       &a_packed,
                       cntl_sub_packm_a( cntl ) );

        // Copy/pack B1, scaling by alpha (each only if instructed).
        bl2_packm_int( alpha,
                       &b_panel,
                       &b_panel_packed,
                       cntl_sub_packm_b( cntl ) );

        // B1 = tril( A ) * B1;
        bl2_trmm_int( BLIS_LEFT,
                      alpha,
                      &a_packed,
                      &b_panel_packed,
                      cntl_sub_trmm( cntl ) );

        // Write the packed B1 back into B1 (if B1 was packed).
        bl2_unpackm_int( &b_panel_packed,
                         &b_panel,
                         cntl_sub_unpackm_b( cntl ) );

        col += blksz;
    }

    // Hand any buffers that packm acquired back to the memory manager.
    bl2_obj_release_pack( &a_packed );
    bl2_obj_release_pack( &b_panel_packed );
}

View File

@@ -1,39 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Blocked variant 3 of left-side, lower-triangular trmm (object-based
// interface). This variant partitions b into column panels.
//   alpha : scalar operand.
//   a     : triangular matrix operand.
//   b     : matrix operand, updated in place.
//   cntl  : control structure supplying the blocksize and the sub-operations.
void bl2_trmm_ll_blk_var3( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );

View File

@@ -1,162 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Signature shared by the type-specific kernels of this variant. Scalars
// and matrices are passed as untyped buffers (with row and column strides
// for the matrices); each kernel casts them to its own element type.
#define FUNCPTR_T trmm_fp
typedef void (*FUNCPTR_T)(
trans_t transa,
diag_t diag,
dim_t m,
dim_t n,
void* alpha,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b
);
// Kernel table, indexed by the datatype (num_t) of matrix a.
// NOTE(review): the s, c, d, z order presumably mirrors the numeric values
// of the num_t datatype constants -- confirm before reordering entries.
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
{
bl2_strmm_ll_unb_var1,
bl2_ctrmm_ll_unb_var1,
bl2_dtrmm_ll_unb_var1,
bl2_ztrmm_ll_unb_var1
};
// Object-based wrapper for unblocked variant 1 of left-side,
// lower-triangular trmm.
//
// Reads the datatype, conjugation/transposition status and diagonal type
// of a, the dimensions of b, and the buffers and strides of both matrices,
// then invokes the kernel registered in ftypes for the datatype of a. The
// buffer of alpha is obtained for that same datatype.
//
//   alpha : scalar operand.
//   a     : triangular matrix operand.
//   b     : matrix operand, updated in place by the kernel.
//   cntl  : not referenced by this function.
void bl2_trmm_ll_unb_var1( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    // Properties of A that steer the kernel.
    num_t   dt       = bl2_obj_datatype( *a );
    trans_t trans    = bl2_obj_conjtrans_status( *a );
    diag_t  diag_a   = bl2_obj_diag( *a );

    // Problem size, taken from B.
    dim_t   rows_b   = bl2_obj_length( *b );
    dim_t   cols_b   = bl2_obj_width( *b );

    // Storage of A.
    void*   a_ptr    = bl2_obj_buffer_at_off( *a );
    inc_t   a_rs     = bl2_obj_row_stride( *a );
    inc_t   a_cs     = bl2_obj_col_stride( *a );

    // Storage of B.
    void*   b_ptr    = bl2_obj_buffer_at_off( *b );
    inc_t   b_rs     = bl2_obj_row_stride( *b );
    inc_t   b_cs     = bl2_obj_col_stride( *b );

    // alpha, viewed in the datatype of A.
    void*   alpha_ptr = bl2_obj_scalar_buffer( dt, *alpha );

    // Select the kernel for this datatype and run it.
    ftypes[dt]( trans,
                diag_a,
                rows_b,
                cols_b,
                alpha_ptr,
                a_ptr, a_rs, a_cs,
                b_ptr, b_rs, b_cs );
}
// Type-specific kernel template for unblocked variant 1. GENTFUNC is
// redefined here and expanded by INSERT_GENTFUNC_BASIC at the bottom, which
// presumably generates one function per basic datatype (the s, c, d and z
// instances are the ones referenced by the ftypes table).
//
// The kernel returns immediately when bl2_zero_dim2( m, n ) holds.
// Otherwise the rows of B are visited from the last (i = m-1) to the first
// (i = 0). With alpha11 = A(i,i), a10t = A(i,0:i-1), b1 = B(i,:) and
// B0 = B(0:i-1,:), each pass performs
//   b1 := ( alpha * alpha11 ) * b1    (alpha alone for a unit diagonal)
//   b1 := b1 + alpha * a10t * B0
// Only row i is written in pass i and B0 lies entirely above it, so B0
// still holds input data when it is read.
//
// Only the conjugation component of transa is consulted; the transposition
// component is not examined and A is addressed exactly as stored.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
) \
{ \
ctype* alpha_cast = alpha; \
ctype* a_cast = a; \
ctype* b_cast = b; \
ctype* one = PASTEMAC(ch,1); \
ctype* a10t; \
ctype* alpha11; \
ctype* b0; \
ctype* b1; \
ctype alpha_alpha11_conj; \
dim_t iter, i; \
dim_t n_ahead; \
conj_t conja; \
\
if ( bl2_zero_dim2( m, n ) ) return; \
\
conja = bl2_extract_conj( transa ); \
\
for ( iter = 0; iter < m; ++iter ) \
{ \
/* iter counts passes; i is the row handled, running from m-1 to 0. */ \
i = m - iter - 1; \
/* n_ahead is the number of rows in B0 (and elements in a10t). */ \
n_ahead = i; \
a10t = a_cast + (i )*rs_a + (0 )*cs_a; \
alpha11 = a_cast + (i )*rs_a + (i )*cs_a; \
b0 = b_cast + (0 )*rs_b + (0 )*cs_b; \
b1 = b_cast + (i )*rs_b + (0 )*cs_b; \
\
/* b1 = alpha * alpha11 * b1; */ \
/* Start from alpha; fold in alpha11 only for a non-unit diagonal. */ \
PASTEMAC2(ch,ch,copys)( *alpha_cast, alpha_alpha11_conj ); \
\
if ( bl2_is_nonunit_diag( diag ) ) \
PASTEMAC2(ch,ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
\
PASTEMAC2(ch,ch,scalv)( BLIS_NO_CONJUGATE, \
n, \
&alpha_alpha11_conj, \
b1, cs_b ); \
\
/* b1 = b1 + alpha * a10t * B0; */ \
/* = b1 + alpha * B0^T * a10t^T; */ \
/* Expressed as a gemv on B0 (n_ahead x n) with the transpose flag. */ \
PASTEMAC(ch,gemv)( BLIS_TRANSPOSE, \
conja, \
n_ahead, \
n, \
alpha_cast, \
b0, rs_b, cs_b, \
a10t, cs_a, \
one, \
b1, cs_b ); \
} \
}
INSERT_GENTFUNC_BASIC( trmm, trmm_ll_unb_var1 )

View File

@@ -1,54 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based entry point for unblocked variant 1 of left-side,
// lower-triangular trmm.
void bl2_trmm_ll_unb_var1( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );
// Prototype template for the type-specific kernels of this variant.
// GENTPROT is redefined here and expanded by INSERT_GENTPROT_BASIC, which
// presumably emits one prototype per basic datatype -- confirm against the
// definition of that macro.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
);
INSERT_GENTPROT_BASIC( trmm_ll_unb_var1 )

View File

@@ -1,159 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Signature shared by the type-specific kernels of this variant. Scalars
// and matrices are passed as untyped buffers (with row and column strides
// for the matrices); each kernel casts them to its own element type.
#define FUNCPTR_T trmm_fp
typedef void (*FUNCPTR_T)(
trans_t transa,
diag_t diag,
dim_t m,
dim_t n,
void* alpha,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b
);
// Kernel table, indexed by the datatype (num_t) of matrix a.
// NOTE(review): the s, c, d, z order presumably mirrors the numeric values
// of the num_t datatype constants -- confirm before reordering entries.
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
{
bl2_strmm_ll_unb_var2,
bl2_ctrmm_ll_unb_var2,
bl2_dtrmm_ll_unb_var2,
bl2_ztrmm_ll_unb_var2
};
// Object-based wrapper for unblocked variant 2 of left-side,
// lower-triangular trmm.
//
// Reads the datatype, conjugation/transposition status and diagonal type
// of a, the dimensions of b, and the buffers and strides of both matrices,
// then invokes the kernel registered in ftypes for the datatype of a. The
// buffer of alpha is obtained for that same datatype.
//
//   alpha : scalar operand.
//   a     : triangular matrix operand.
//   b     : matrix operand, updated in place by the kernel.
//   cntl  : not referenced by this function.
void bl2_trmm_ll_unb_var2( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    // Properties of A that steer the kernel.
    num_t   dt       = bl2_obj_datatype( *a );
    trans_t trans    = bl2_obj_conjtrans_status( *a );
    diag_t  diag_a   = bl2_obj_diag( *a );

    // Problem size, taken from B.
    dim_t   rows_b   = bl2_obj_length( *b );
    dim_t   cols_b   = bl2_obj_width( *b );

    // Storage of A.
    void*   a_ptr    = bl2_obj_buffer_at_off( *a );
    inc_t   a_rs     = bl2_obj_row_stride( *a );
    inc_t   a_cs     = bl2_obj_col_stride( *a );

    // Storage of B.
    void*   b_ptr    = bl2_obj_buffer_at_off( *b );
    inc_t   b_rs     = bl2_obj_row_stride( *b );
    inc_t   b_cs     = bl2_obj_col_stride( *b );

    // alpha, viewed in the datatype of A.
    void*   alpha_ptr = bl2_obj_scalar_buffer( dt, *alpha );

    // Select the kernel for this datatype and run it.
    ftypes[dt]( trans,
                diag_a,
                rows_b,
                cols_b,
                alpha_ptr,
                a_ptr, a_rs, a_cs,
                b_ptr, b_rs, b_cs );
}
// Type-specific kernel template for unblocked variant 2. GENTFUNC is
// redefined here and expanded by INSERT_GENTFUNC_BASIC at the bottom, which
// presumably generates one function per basic datatype (the s, c, d and z
// instances are the ones referenced by the ftypes table).
//
// The kernel returns immediately when bl2_zero_dim2( m, n ) holds.
// Otherwise the rows of B are visited from the last (i = m-1) to the first
// (i = 0). With alpha11 = A(i,i), a21 = A(i+1:m-1,i), b1 = B(i,:) and
// B2 = B(i+1:m-1,:), each pass performs
//   B2 := B2 + alpha * a21 * b1       (rank-1 update via ger)
//   b1 := ( alpha * alpha11 ) * b1    (alpha alone for a unit diagonal)
// The rank-1 update is issued first because it reads b1 before the scaling
// overwrites it.
//
// Only the conjugation component of transa is consulted; the transposition
// component is not examined and A is addressed exactly as stored.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
) \
{ \
ctype* alpha_cast = alpha; \
ctype* a_cast = a; \
ctype* b_cast = b; \
ctype* alpha11; \
ctype* a21; \
ctype* b1; \
ctype* b2; \
ctype alpha_alpha11_conj; \
dim_t iter, i; \
dim_t n_behind; \
conj_t conja; \
\
if ( bl2_zero_dim2( m, n ) ) return; \
\
conja = bl2_extract_conj( transa ); \
\
for ( iter = 0; iter < m; ++iter ) \
{ \
/* iter counts passes; i is the row handled, running from m-1 to 0. */ \
i = m - iter - 1; \
/* n_behind = m - i - 1 is the number of rows below row i. On the */ \
/* first pass it is 0, so a21 and b2 are computed for row index m */ \
/* and ger is invoked with zero rows. */ \
n_behind = iter; \
alpha11 = a_cast + (i )*rs_a + (i )*cs_a; \
a21 = a_cast + (i+1)*rs_a + (i )*cs_a; \
b1 = b_cast + (i )*rs_b + (0 )*cs_b; \
b2 = b_cast + (i+1)*rs_b + (0 )*cs_b; \
\
/* B2 = B2 + alpha * a21 * b1; */ \
PASTEMAC(ch,ger)( conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
n, \
alpha_cast, \
a21, rs_a, \
b1, cs_b, \
b2, rs_b, cs_b ); \
\
/* b1 = alpha * alpha11 * b1; */ \
/* Start from alpha; fold in alpha11 only for a non-unit diagonal. */ \
PASTEMAC2(ch,ch,copys)( *alpha_cast, alpha_alpha11_conj ); \
\
if ( bl2_is_nonunit_diag( diag ) ) \
PASTEMAC2(ch,ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
\
PASTEMAC2(ch,ch,scalv)( BLIS_NO_CONJUGATE, \
n, \
&alpha_alpha11_conj, \
b1, cs_b ); \
} \
}
INSERT_GENTFUNC_BASIC( trmm, trmm_ll_unb_var2 )

View File

@@ -1,54 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based entry point for unblocked variant 2 of left-side,
// lower-triangular trmm.
void bl2_trmm_ll_unb_var2( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );
// Prototype template for the type-specific kernels of this variant.
// GENTPROT is redefined here and expanded by INSERT_GENTPROT_BASIC, which
// presumably emits one prototype per basic datatype -- confirm against the
// definition of that macro.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
);
INSERT_GENTPROT_BASIC( trmm_ll_unb_var2 )

View File

@@ -1,134 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Signature shared by the type-specific kernels of this variant. Scalars
// and matrices are passed as untyped buffers (with row and column strides
// for the matrices); each kernel casts them to its own element type.
#define FUNCPTR_T trmm_fp
typedef void (*FUNCPTR_T)(
trans_t transa,
diag_t diag,
dim_t m,
dim_t n,
void* alpha,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b
);
// Kernel table, indexed by the datatype (num_t) of matrix a.
// NOTE(review): the s, c, d, z order presumably mirrors the numeric values
// of the num_t datatype constants -- confirm before reordering entries.
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
{
bl2_strmm_ll_unb_var3,
bl2_ctrmm_ll_unb_var3,
bl2_dtrmm_ll_unb_var3,
bl2_ztrmm_ll_unb_var3
};
// Object-based wrapper for unblocked variant 3 of left-side,
// lower-triangular trmm.
//
// Reads the datatype, conjugation/transposition status and diagonal type
// of a, the dimensions of b, and the buffers and strides of both matrices,
// then invokes the kernel registered in ftypes for the datatype of a. The
// buffer of alpha is obtained for that same datatype.
//
//   alpha : scalar operand.
//   a     : triangular matrix operand.
//   b     : matrix operand, updated in place by the kernel.
//   cntl  : not referenced by this function.
void bl2_trmm_ll_unb_var3( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    // Properties of A that steer the kernel.
    num_t   dt       = bl2_obj_datatype( *a );
    trans_t trans    = bl2_obj_conjtrans_status( *a );
    diag_t  diag_a   = bl2_obj_diag( *a );

    // Problem size, taken from B.
    dim_t   rows_b   = bl2_obj_length( *b );
    dim_t   cols_b   = bl2_obj_width( *b );

    // Storage of A.
    void*   a_ptr    = bl2_obj_buffer_at_off( *a );
    inc_t   a_rs     = bl2_obj_row_stride( *a );
    inc_t   a_cs     = bl2_obj_col_stride( *a );

    // Storage of B.
    void*   b_ptr    = bl2_obj_buffer_at_off( *b );
    inc_t   b_rs     = bl2_obj_row_stride( *b );
    inc_t   b_cs     = bl2_obj_col_stride( *b );

    // alpha, viewed in the datatype of A.
    void*   alpha_ptr = bl2_obj_scalar_buffer( dt, *alpha );

    // Select the kernel for this datatype and run it.
    ftypes[dt]( trans,
                diag_a,
                rows_b,
                cols_b,
                alpha_ptr,
                a_ptr, a_rs, a_cs,
                b_ptr, b_rs, b_cs );
}
// Type-specific kernel template for unblocked variant 3. GENTFUNC is
// redefined here and expanded by INSERT_GENTFUNC_BASIC at the bottom, which
// presumably generates one function per basic datatype (the s, c, d and z
// instances are the ones referenced by the ftypes table).
//
// The kernel returns immediately when bl2_zero_dim2( m, n ) holds.
// Otherwise it visits the n columns of B from left to right and applies a
// lower-triangular trmv with the m x m matrix A to each column:
//   b1 := alpha * tril( A ) * b1,   b1 = B(:,j)
// transa and diag are passed through to trmv unchanged.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
) \
{ \
ctype* alpha_cast = alpha; \
ctype* a_cast = a; \
ctype* b_cast = b; \
ctype* b1; \
dim_t j; \
\
if ( bl2_zero_dim2( m, n ) ) return; \
\
for ( j = 0; j < n; ++j ) \
{ \
/* Column j of B; consecutive elements are rs_b apart. */ \
b1 = b_cast + (0 )*rs_b + (j )*cs_b; \
\
/* b1 = alpha * tril( A ) * b1; */ \
PASTEMAC2(ch,ch,trmv)( BLIS_LOWER, \
transa, \
diag, \
m, \
alpha_cast, \
a_cast, rs_a, cs_a, \
b1, rs_b ); \
} \
}
INSERT_GENTFUNC_BASIC( trmm, trmm_ll_unb_var3 )

View File

@@ -1,54 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based entry point for unblocked variant 3 of left-side,
// lower-triangular trmm.
void bl2_trmm_ll_unb_var3( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );
// Prototype template for the type-specific kernels of this variant.
// GENTPROT is redefined here and expanded by INSERT_GENTPROT_BASIC, which
// presumably emits one prototype per basic datatype -- confirm against the
// definition of that macro.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
);
INSERT_GENTPROT_BASIC( trmm_ll_unb_var3 )

View File

@@ -1,126 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Blocked variant 1 of left-side, upper-triangular trmm.
//
// The diagonal of a is traversed in blocks using the tl2br (for a) and t2b
// (for b) partitioning helpers. At each step, with A11 the current diagonal
// block, A12 the block to its right, and B1/B2 the matching row panels of
// b:
//   1. A11 and B1 are handed to packm, which copies/packs them and applies
//      alpha only if cntl instructs it to;
//   2. B1 := triu( A11 ) * B1    (trmm, on the packed operands);
//   3. B1 := B1 + A12 * B2       (gemm, accumulating into the packed B1);
//   4. the packed B1 is unpacked back into B1 (if it was packed).
//
//   alpha : scalar; forwarded to scalm and packm, while the trmm and gemm
//           sub-problems are invoked with BLIS_ONE.
//   a     : triangular matrix (expected to be square).
//   b     : matrix updated in place.
//   cntl  : supplies the blocksize and the scalm, packm, trmm, gemm and
//           unpackm sub-operations.
void bl2_trmm_lu_blk_var1( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    obj_t a_diag, a_diag_packed;   // A11 and its packed counterpart
    obj_t a_right;                 // A12
    obj_t b_cur, b_cur_packed;     // B1 and its packed counterpart
    obj_t b_below;                 // B2
    dim_t offset;
    dim_t blksz;
    dim_t dim_a;

    // Both pack objects must be initialized before packm sees them.
    bl2_obj_init_pack( &a_diag_packed );
    bl2_obj_init_pack( &b_cur_packed );

    // A should be square, so its length is the extent of the diagonal no
    // matter what transposition is recorded in the object.
    dim_a = bl2_obj_length( *a );

    // Let the scalm sub-operation apply alpha to all of B (if instructed).
    bl2_scalm_int( alpha,
                   b,
                   cntl_sub_scalm( cntl ) );

    // Walk along the diagonal, one block per pass.
    offset = 0;
    while ( offset < dim_a )
    {
        // Blocksize for this pass. The object a (not b) is passed on
        // purpose so that the execution datatype selects the blocksize;
        // this gives the right value when a is complex and b is real.
        blksz = bl2_determine_blocksize_f( offset, dim_a,
                                           a,
                                           cntl_blocksize( cntl ) );

        // Views of A11 and A12.
        bl2_acquire_mpart_tl2br( BLIS_SUBPART11,
                                 offset, blksz, a, &a_diag );
        bl2_acquire_mpart_tl2br( BLIS_SUBPART12,
                                 offset, blksz, a, &a_right );

        // Views of B1 and B2.
        bl2_acquire_mpart_t2b( BLIS_SUBPART1,
                               offset, blksz, b, &b_cur );
        bl2_acquire_mpart_t2b( BLIS_SUBPART2,
                               offset, blksz, b, &b_below );

        // Copy/pack A11, scaling by alpha (each only if instructed).
        bl2_packm_int( alpha,
                       &a_diag,
                       &a_diag_packed,
                       cntl_sub_packm_a( cntl ) );

        // Copy/pack B1, scaling by alpha (each only if instructed).
        bl2_packm_int( alpha,
                       &b_cur,
                       &b_cur_packed,
                       cntl_sub_packm_b( cntl ) );

        // B1 = triu( A11 ) * B1;
        bl2_trmm_int( BLIS_LEFT,
                      &BLIS_ONE,
                      &a_diag_packed,
                      &b_cur_packed,
                      cntl_sub_trmm( cntl ) );

        // B1 = B1 + A12 * B2;
        bl2_gemm_int( &BLIS_ONE,
                      &a_right,
                      &b_below,
                      &BLIS_ONE,
                      &b_cur_packed,
                      cntl_sub_gemm( cntl ) );

        // Write the packed B1 back into B1 (if B1 was packed).
        bl2_unpackm_int( &b_cur_packed,
                         &b_cur,
                         cntl_sub_unpackm_b( cntl ) );

        offset += blksz;
    }

    // Hand any buffers that packm acquired back to the memory manager.
    bl2_obj_release_pack( &a_diag_packed );
    bl2_obj_release_pack( &b_cur_packed );
}

View File

@@ -1,39 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype for blocked variant 1 of trmm. Takes the scalar alpha, the
// matrix operands a and b, and a trmm control tree.
// NOTE(review): judging by the "lu" in the name this is the left-side,
// upper-triangular case that overwrites b in place -- confirm against the
// definition in the matching .c file.
void bl2_trmm_lu_blk_var1( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );

View File

@@ -1,126 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
//
// Blocked variant 2 of the in-place triangular matrix multiply in which
// A is applied to B from the left (the sub-trmm call is made with
// BLIS_LEFT) and the triu() part of each diagonal block is used.
//
// The routine marches down the diagonal of A in steps of the algorithmic
// blocksize. At every step it first accumulates A01 * B1 into B0 (the rows
// of B above the current block) and only then overwrites B1 with
// triu( A11 ) * B1, so the gemm update still sees the B1 produced by packm.
//
//   alpha  scalar handed to the scalm and packm sub-operations; whether
//          any of them actually applies it is left to the control tree
//   a      triangular matrix A (expected to be square)
//   b      matrix B, updated in place
//   cntl   control tree providing the blocksize and sub-operation nodes
//
void bl2_trmm_lu_blk_var2( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    obj_t diag_blk, diag_blk_packed;   // A11 and its packed counterpart
    obj_t above_blk;                   // A01
    obj_t b_rows, b_rows_packed;       // B1 and its packed counterpart
    obj_t b_above;                     // B0

    dim_t dim_a;                       // length of A
    dim_t offset;                      // current position along the diagonal
    dim_t blk;                         // blocksize chosen for this step

    // Put both pack objects into their initial state before first use.
    bl2_obj_init_pack( &diag_blk_packed );
    bl2_obj_init_pack( &b_rows_packed );

    // A is expected to be square, so its length is the iteration bound
    // regardless of any transposition recorded in the object.
    dim_a = bl2_obj_length( *a );

    // Give the control tree a chance to scale all of B by alpha up front.
    bl2_scalm_int( alpha,
                   b,
                   cntl_sub_scalm( cntl ) );

    // Walk the diagonal from the top-left corner to the bottom-right one.
    offset = 0;
    while ( offset < dim_a )
    {
        // The blocksize is derived from a (not b) on purpose: that yields
        // the right value when c and a are complex while b is real.
        blk = bl2_determine_blocksize_f( offset, dim_a,
                                         a,
                                         cntl_blocksize( cntl ) );

        // Carve out the diagonal block A11 and the block A01 above it.
        bl2_acquire_mpart_tl2br( BLIS_SUBPART11,
                                 offset, blk, a, &diag_blk );
        bl2_acquire_mpart_tl2br( BLIS_SUBPART01,
                                 offset, blk, a, &above_blk );

        // Carve out the matching row block B1 and the rows B0 above it.
        bl2_acquire_mpart_t2b( BLIS_SUBPART1,
                               offset, blk, b, &b_rows );
        bl2_acquire_mpart_t2b( BLIS_SUBPART0,
                               offset, blk, b, &b_above );

        // Pack A11 and B1; copying and alpha-scaling happen only if the
        // respective packm control node asks for them.
        bl2_packm_int( alpha,
                       &diag_blk,
                       &diag_blk_packed,
                       cntl_sub_packm_a( cntl ) );
        bl2_packm_int( alpha,
                       &b_rows,
                       &b_rows_packed,
                       cntl_sub_packm_b( cntl ) );

        // B0 = B0 + A01 * B1;
        bl2_gemm_int( &BLIS_ONE,
                      &above_blk,
                      &b_rows_packed,
                      &BLIS_ONE,
                      &b_above,
                      cntl_sub_gemm( cntl ) );

        // B1 = triu( A11 ) * B1;
        bl2_trmm_int( BLIS_LEFT,
                      &BLIS_ONE,
                      &diag_blk_packed,
                      &b_rows_packed,
                      cntl_sub_trmm( cntl ) );

        // Write the result back into B1 if a packed copy was used.
        bl2_unpackm_int( &b_rows_packed,
                         &b_rows,
                         cntl_sub_unpackm_b( cntl ) );

        offset += blk;
    }

    // Hand any buffers obtained by packm back to the memory manager.
    bl2_obj_release_pack( &diag_blk_packed );
    bl2_obj_release_pack( &b_rows_packed );
}

View File

@@ -1,39 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype for blocked variant 2 of trmm. Takes the scalar alpha, the
// matrix operands a and b, and a trmm control tree; returns nothing.
// NOTE(review): the "lu" in the name suggests the left-side,
// upper-triangular case with b updated in place -- confirm against the
// definition in the matching .c file.
void bl2_trmm_lu_blk_var2( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );

View File

@@ -1,107 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
//
// Blocked variant 3 of the in-place triangular matrix multiply with A
// applied from the left (the sub-trmm call is made with BLIS_LEFT).
//
// B is split into column panels along its n dimension; every panel B1 is
// overwritten with triu( A ) * B1 by the sub-trmm named in the control
// tree. A itself is never partitioned.
//
//   alpha  scalar handed to the scalm, packm and trmm sub-operations
//   a      triangular matrix A
//   b      matrix B, updated in place
//   cntl   control tree providing the blocksize and sub-operation nodes
//
// NOTE(review): alpha is forwarded to bl2_trmm_int() here in addition to
// bl2_scalm_int() and bl2_packm_int(). If more than one of those stages
// actually applies it, B would be scaled more than once -- confirm that the
// control trees used with this variant rule that out.
//
void bl2_trmm_lu_blk_var3( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    obj_t a_packed;                    // packed counterpart of A
    obj_t b_panel, b_panel_packed;     // B1 and its packed counterpart

    dim_t n_cols;                      // width of B after transposition
    dim_t col;                         // first column of the current panel
    dim_t blk;                         // blocksize chosen for this step

    // Put both pack objects into their initial state before first use.
    bl2_obj_init_pack( &a_packed );
    bl2_obj_init_pack( &b_panel_packed );

    // The loop runs over the width of B, taking transposition into account.
    n_cols = bl2_obj_width_after_trans( *b );

    // Give the control tree a chance to scale all of B by alpha up front.
    bl2_scalm_int( alpha,
                   b,
                   cntl_sub_scalm( cntl ) );

    // Sweep over B one column panel at a time, left to right.
    col = 0;
    while ( col < n_cols )
    {
        // The blocksize is derived from a (not b) on purpose: that yields
        // the right value when c and a are complex while b is real.
        blk = bl2_determine_blocksize_f( col, n_cols,
                                         a,
                                         cntl_blocksize( cntl ) );

        // Carve out the current column panel B1.
        bl2_acquire_mpart_l2r( BLIS_SUBPART1,
                               col, blk, b, &b_panel );

        // Pack A and B1; copying and alpha-scaling happen only if the
        // respective packm control node asks for them. Note that the
        // packm call for A is issued on every pass of the loop.
        bl2_packm_int( alpha,
                       a,
                       &a_packed,
                       cntl_sub_packm_a( cntl ) );
        bl2_packm_int( alpha,
                       &b_panel,
                       &b_panel_packed,
                       cntl_sub_packm_b( cntl ) );

        // B1 = triu( A ) * B1;
        bl2_trmm_int( BLIS_LEFT,
                      alpha,
                      &a_packed,
                      &b_panel_packed,
                      cntl_sub_trmm( cntl ) );

        // Write the result back into B1 if a packed copy was used.
        bl2_unpackm_int( &b_panel_packed,
                         &b_panel,
                         cntl_sub_unpackm_b( cntl ) );

        col += blk;
    }

    // Hand any buffers obtained by packm back to the memory manager.
    bl2_obj_release_pack( &a_packed );
    bl2_obj_release_pack( &b_panel_packed );
}

View File

@@ -1,39 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype for blocked variant 3 of trmm. Takes the scalar alpha, the
// matrix operands a and b, and a trmm control tree; returns nothing.
// NOTE(review): the "lu" in the name suggests the left-side,
// upper-triangular case with b updated in place -- confirm against the
// definition in the matching .c file.
void bl2_trmm_lu_blk_var3( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );

View File

@@ -1,162 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Function-pointer type matching the signature of the type-specific
// unblocked kernels generated further down in this file. Operands are
// passed as untyped buffers together with their row and column strides.
#define FUNCPTR_T trmm_fp
typedef void (*FUNCPTR_T)(
trans_t transa,
diag_t diag,
dim_t m,
dim_t n,
void* alpha,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b
);
// Dispatch table holding one kernel per floating-point type; the
// object-based wrapper indexes it with the datatype of A.
// NOTE(review): the s, c, d, z ordering has to agree with the numeric
// values of num_t -- confirm against the num_t definition.
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
{
bl2_strmm_lu_unb_var1,
bl2_ctrmm_lu_unb_var1,
bl2_dtrmm_lu_unb_var1,
bl2_ztrmm_lu_unb_var1
};
//
// Object-based front end of unblocked variant 1.
//
// Unpacks the properties of the operand objects (datatype, conjugation/
// transposition status, diagonal type, dimensions, buffers and strides)
// and forwards them to the type-specific kernel selected by the datatype
// of A.
//
//   alpha  scalar; its buffer is queried for the datatype of A
//   a      triangular matrix A
//   b      matrix B; its length and width become m and n of the kernel
//   cntl   control tree; not consulted by this function
//
void bl2_trmm_lu_unb_var1( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    // The datatype of A drives both the kernel choice and the scalar lookup.
    num_t   datatype  = bl2_obj_datatype( *a );

    trans_t trans_a   = bl2_obj_conjtrans_status( *a );
    diag_t  diag_a    = bl2_obj_diag( *a );

    // Problem dimensions are taken from B.
    dim_t   rows_b    = bl2_obj_length( *b );
    dim_t   cols_b    = bl2_obj_width( *b );

    void*   a_buf     = bl2_obj_buffer_at_off( *a );
    inc_t   a_rs      = bl2_obj_row_stride( *a );
    inc_t   a_cs      = bl2_obj_col_stride( *a );

    void*   b_buf     = bl2_obj_buffer_at_off( *b );
    inc_t   b_rs      = bl2_obj_row_stride( *b );
    inc_t   b_cs      = bl2_obj_col_stride( *b );

    void*   alpha_buf = bl2_obj_scalar_buffer( datatype, *alpha );

    // Look up the kernel for this datatype and call it right away.
    ftypes[datatype]( trans_a,
                      diag_a,
                      rows_b,
                      cols_b,
                      alpha_buf,
                      a_buf, a_rs, a_cs,
                      b_buf, b_rs, b_cs );
}
// Template for the type-specific kernels of unblocked variant 1.
//
// The generated function visits the rows of B from top to bottom. For row
// index i it
//   1. scales row i of B (n elements, stride cs_b) by alpha, additionally
//      multiplied by the diagonal element alpha11 of A (conjugated
//      according to conja) when diag is non-unit, and
//   2. adds alpha * a12t * B2 to that row via gemv, where a12t is the part
//      of row i of A to the right of the diagonal (m-i-1 elements, stride
//      cs_a) and B2 consists of the rows of B below row i.
// B2 has not been modified yet when it is read, because rows are only
// overwritten in increasing order of i.
//
// The function returns immediately if bl2_zero_dim2( m, n ) holds. Of
// transa only the conjugation component (bl2_extract_conj) is used here.
// The opname macro parameter is not referenced by the template body.
//
// NOTE(review): INSERT_GENTFUNC_BASIC presumably instantiates the template
// once per floating-point type -- confirm against its definition.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
) \
{ \
ctype* alpha_cast = alpha; \
ctype* a_cast = a; \
ctype* b_cast = b; \
ctype* one = PASTEMAC(ch,1); \
ctype* alpha11; \
ctype* a12t; \
ctype* b1; \
ctype* b2; \
ctype alpha_alpha11_conj; \
dim_t iter, i; \
dim_t n_ahead; \
conj_t conja; \
\
if ( bl2_zero_dim2( m, n ) ) return; \
\
conja = bl2_extract_conj( transa ); \
\
for ( iter = 0; iter < m; ++iter ) \
{ \
i = iter; \
/* Number of rows of B (and entries of row i of A) past index i. */ \
n_ahead = m - i - 1; \
/* On the last iteration n_ahead is zero, so a12t and b2 are passed to
   gemv with a zero dimension. */ \
alpha11 = a_cast + (i )*rs_a + (i )*cs_a; \
a12t = a_cast + (i )*rs_a + (i+1)*cs_a; \
b1 = b_cast + (i )*rs_b + (0 )*cs_b; \
b2 = b_cast + (i+1)*rs_b + (0 )*cs_b; \
\
/* b1 = alpha * alpha11 * b1; */ \
PASTEMAC2(ch,ch,copys)( *alpha_cast, alpha_alpha11_conj ); \
\
if ( bl2_is_nonunit_diag( diag ) ) \
PASTEMAC2(ch,ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
\
PASTEMAC2(ch,ch,scalv)( BLIS_NO_CONJUGATE, \
n, \
&alpha_alpha11_conj, \
b1, cs_b ); \
\
/* b1 = b1 + alpha * a12t * B2; */ \
/* = b1 + alpha * B2^T * a12t^T; */ \
PASTEMAC(ch,gemv)( BLIS_TRANSPOSE, \
conja, \
n_ahead, \
n, \
alpha_cast, \
b2, rs_b, cs_b, \
a12t, cs_a, \
one, \
b1, cs_b ); \
} \
}
INSERT_GENTFUNC_BASIC( trmm, trmm_lu_unb_var1 )

View File

@@ -1,54 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based entry point of unblocked variant 1: takes the scalar
// alpha, the matrix operands a and b, and a trmm control tree.
void bl2_trmm_lu_unb_var1( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );
// Prototype template for the type-specific kernels of this variant. The
// generated name is formed from ch and varname; operands are passed as
// untyped buffers with their row and column strides. The ctype macro
// parameter is not referenced by the template body.
// NOTE(review): INSERT_GENTPROT_BASIC presumably emits one prototype per
// floating-point type -- confirm against its definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
);
INSERT_GENTPROT_BASIC( trmm_lu_unb_var1 )

View File

@@ -1,159 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Function-pointer type matching the signature of the type-specific
// unblocked kernels generated further down in this file. Operands are
// passed as untyped buffers together with their row and column strides.
#define FUNCPTR_T trmm_fp
typedef void (*FUNCPTR_T)(
trans_t transa,
diag_t diag,
dim_t m,
dim_t n,
void* alpha,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b
);
// Dispatch table holding one kernel per floating-point type; the
// object-based wrapper indexes it with the datatype of A.
// NOTE(review): the s, c, d, z ordering has to agree with the numeric
// values of num_t -- confirm against the num_t definition.
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
{
bl2_strmm_lu_unb_var2,
bl2_ctrmm_lu_unb_var2,
bl2_dtrmm_lu_unb_var2,
bl2_ztrmm_lu_unb_var2
};
//
// Object-based front end of unblocked variant 2.
//
// Unpacks the properties of the operand objects (datatype, conjugation/
// transposition status, diagonal type, dimensions, buffers and strides)
// and forwards them to the type-specific kernel selected by the datatype
// of A.
//
//   alpha  scalar; its buffer is queried for the datatype of A
//   a      triangular matrix A
//   b      matrix B; its length and width become m and n of the kernel
//   cntl   control tree; not consulted by this function
//
void bl2_trmm_lu_unb_var2( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    // The datatype of A drives both the kernel choice and the scalar lookup.
    num_t   datatype  = bl2_obj_datatype( *a );

    trans_t trans_a   = bl2_obj_conjtrans_status( *a );
    diag_t  diag_a    = bl2_obj_diag( *a );

    // Problem dimensions are taken from B.
    dim_t   rows_b    = bl2_obj_length( *b );
    dim_t   cols_b    = bl2_obj_width( *b );

    void*   a_buf     = bl2_obj_buffer_at_off( *a );
    inc_t   a_rs      = bl2_obj_row_stride( *a );
    inc_t   a_cs      = bl2_obj_col_stride( *a );

    void*   b_buf     = bl2_obj_buffer_at_off( *b );
    inc_t   b_rs      = bl2_obj_row_stride( *b );
    inc_t   b_cs      = bl2_obj_col_stride( *b );

    void*   alpha_buf = bl2_obj_scalar_buffer( datatype, *alpha );

    // Look up the kernel for this datatype and call it right away.
    ftypes[datatype]( trans_a,
                      diag_a,
                      rows_b,
                      cols_b,
                      alpha_buf,
                      a_buf, a_rs, a_cs,
                      b_buf, b_rs, b_cs );
}
// Template for the type-specific kernels of unblocked variant 2.
//
// The generated function visits the rows of B from top to bottom. For row
// index i it
//   1. performs the rank-1 update B0 = B0 + alpha * a01 * b1 via ger, where
//      a01 is the part of column i of A above the diagonal (i elements,
//      stride rs_a, conjugated according to conja), b1 is row i of B and
//      B0 consists of the first i rows of B, and
//   2. scales row i of B (n elements, stride cs_b) by alpha, additionally
//      multiplied by the diagonal element alpha11 of A (conjugated
//      according to conja) when diag is non-unit.
// The order matters: the rank-1 update reads b1 before it is scaled.
//
// The function returns immediately if bl2_zero_dim2( m, n ) holds. Of
// transa only the conjugation component (bl2_extract_conj) is used here.
// The opname macro parameter is not referenced by the template body.
//
// NOTE(review): INSERT_GENTFUNC_BASIC presumably instantiates the template
// once per floating-point type -- confirm against its definition.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
) \
{ \
ctype* alpha_cast = alpha; \
ctype* a_cast = a; \
ctype* b_cast = b; \
ctype* a01; \
ctype* alpha11; \
ctype* b0; \
ctype* b1; \
ctype alpha_alpha11_conj; \
dim_t iter, i; \
dim_t n_behind; \
conj_t conja; \
\
if ( bl2_zero_dim2( m, n ) ) return; \
\
conja = bl2_extract_conj( transa ); \
\
for ( iter = 0; iter < m; ++iter ) \
{ \
i = iter; \
/* Number of rows of B (and entries of column i of A) before index i. */ \
n_behind = i; \
a01 = a_cast + (0 )*rs_a + (i )*cs_a; \
alpha11 = a_cast + (i )*rs_a + (i )*cs_a; \
b0 = b_cast + (0 )*rs_b + (0 )*cs_b; \
b1 = b_cast + (i )*rs_b + (0 )*cs_b; \
\
/* B0 = B0 + alpha * a01 * b1; */ \
PASTEMAC(ch,ger)( conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
n, \
alpha_cast, \
a01, rs_a, \
b1, cs_b, \
b0, rs_b, cs_b ); \
\
/* b1 = alpha * alpha11 * b1; */ \
PASTEMAC2(ch,ch,copys)( *alpha_cast, alpha_alpha11_conj ); \
\
if ( bl2_is_nonunit_diag( diag ) ) \
PASTEMAC2(ch,ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
\
PASTEMAC2(ch,ch,scalv)( BLIS_NO_CONJUGATE, \
n, \
&alpha_alpha11_conj, \
b1, cs_b ); \
} \
}
INSERT_GENTFUNC_BASIC( trmm, trmm_lu_unb_var2 )

View File

@@ -1,54 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based entry point of unblocked variant 2: takes the scalar
// alpha, the matrix operands a and b, and a trmm control tree.
void bl2_trmm_lu_unb_var2( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );
// Prototype template for the type-specific kernels of this variant. The
// generated name is formed from ch and varname; operands are passed as
// untyped buffers with their row and column strides. The ctype macro
// parameter is not referenced by the template body.
// NOTE(review): INSERT_GENTPROT_BASIC presumably emits one prototype per
// floating-point type -- confirm against its definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
);
INSERT_GENTPROT_BASIC( trmm_lu_unb_var2 )

View File

@@ -1,134 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Function-pointer type matching the signature of the type-specific
// unblocked kernels generated further down in this file. Operands are
// passed as untyped buffers together with their row and column strides.
#define FUNCPTR_T trmm_fp
typedef void (*FUNCPTR_T)(
trans_t transa,
diag_t diag,
dim_t m,
dim_t n,
void* alpha,
void* a, inc_t rs_a, inc_t cs_a,
void* b, inc_t rs_b, inc_t cs_b
);
// Dispatch table holding one kernel per floating-point type; the
// object-based wrapper indexes it with the datatype of A.
// NOTE(review): the s, c, d, z ordering has to agree with the numeric
// values of num_t -- confirm against the num_t definition.
static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
{
bl2_strmm_lu_unb_var3,
bl2_ctrmm_lu_unb_var3,
bl2_dtrmm_lu_unb_var3,
bl2_ztrmm_lu_unb_var3
};
//
// Object-based front end of unblocked variant 3.
//
// Unpacks the properties of the operand objects (datatype, conjugation/
// transposition status, diagonal type, dimensions, buffers and strides)
// and forwards them to the type-specific kernel selected by the datatype
// of A.
//
//   alpha  scalar; its buffer is queried for the datatype of A
//   a      triangular matrix A
//   b      matrix B; its length and width become m and n of the kernel
//   cntl   control tree; not consulted by this function
//
void bl2_trmm_lu_unb_var3( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           trmm_t* cntl )
{
    // The datatype of A drives both the kernel choice and the scalar lookup.
    num_t   datatype  = bl2_obj_datatype( *a );

    trans_t trans_a   = bl2_obj_conjtrans_status( *a );
    diag_t  diag_a    = bl2_obj_diag( *a );

    // Problem dimensions are taken from B.
    dim_t   rows_b    = bl2_obj_length( *b );
    dim_t   cols_b    = bl2_obj_width( *b );

    void*   a_buf     = bl2_obj_buffer_at_off( *a );
    inc_t   a_rs      = bl2_obj_row_stride( *a );
    inc_t   a_cs      = bl2_obj_col_stride( *a );

    void*   b_buf     = bl2_obj_buffer_at_off( *b );
    inc_t   b_rs      = bl2_obj_row_stride( *b );
    inc_t   b_cs      = bl2_obj_col_stride( *b );

    void*   alpha_buf = bl2_obj_scalar_buffer( datatype, *alpha );

    // Look up the kernel for this datatype and call it right away.
    ftypes[datatype]( trans_a,
                      diag_a,
                      rows_b,
                      cols_b,
                      alpha_buf,
                      a_buf, a_rs, a_cs,
                      b_buf, b_rs, b_cs );
}
// Template for the type-specific kernels of unblocked variant 3.
//
// The generated function visits the columns of B from left to right and
// overwrites each column b1 (m elements, stride rs_b) with
// alpha * triu( A ) * b1 by calling trmv with BLIS_UPPER, passing transa
// and diag through unchanged.
//
// The function returns immediately if bl2_zero_dim2( m, n ) holds. The
// opname macro parameter is not referenced by the template body.
//
// NOTE(review): INSERT_GENTFUNC_BASIC presumably instantiates the template
// once per floating-point type -- confirm against its definition.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
) \
{ \
ctype* alpha_cast = alpha; \
ctype* a_cast = a; \
ctype* b_cast = b; \
ctype* b1; \
dim_t j; \
\
if ( bl2_zero_dim2( m, n ) ) return; \
\
for ( j = 0; j < n; ++j ) \
{ \
/* Start of column j of B. */ \
b1 = b_cast + (0 )*rs_b + (j )*cs_b; \
\
/* b1 = alpha * triu( A ) * b1; */ \
PASTEMAC2(ch,ch,trmv)( BLIS_UPPER, \
transa, \
diag, \
m, \
alpha_cast, \
a_cast, rs_a, cs_a, \
b1, rs_b ); \
} \
}
INSERT_GENTFUNC_BASIC( trmm, trmm_lu_unb_var3 )

View File

@@ -1,54 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based entry point of unblocked variant 3: takes the scalar
// alpha, the matrix operands a and b, and a trmm control tree.
void bl2_trmm_lu_unb_var3( obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* cntl );
// Prototype template for the type-specific kernels of this variant. The
// generated name is formed from ch and varname; operands are passed as
// untyped buffers with their row and column strides. The ctype macro
// parameter is not referenced by the template body.
// NOTE(review): INSERT_GENTPROT_BASIC presumably emits one prototype per
// floating-point type -- confirm against its definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
trans_t transa, \
diag_t diag, \
dim_t m, \
dim_t n, \
void* alpha, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
);
INSERT_GENTPROT_BASIC( trmm_lu_unb_var3 )

View File

@@ -1,466 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
// Function-pointer type matching the signature of the type-specific
// kernels of this variant: the diagonal offset of A, the m/n/k dimensions,
// A and B as untyped buffers with row, column and panel strides, the beta
// scalar, and C with row and column strides.
// NOTE(review): the alias is named gemm_fp although this file implements a
// trmm kernel; the name is local to this file, so it is harmless, but it
// may be a leftover from a gemm source this file was derived from.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
doff_t diagoffa,
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c
);
// Dispatch table indexed by the execution datatype in the object-based
// wrapper below.
// NOTE(review): GENARRAY presumably expands to an array definition listing
// the per-type trmm_u_ker_var2 functions -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_u_ker_var2);
//
// Object-based front end of the trmm_u_ker_var2 kernel.
//
// Unpacks the diagonal offset of A, the problem dimensions, and the buffers
// and strides of A, B and C, resolves the buffer of beta, and forwards
// everything to the type-specific kernel selected by the execution datatype
// of C.
//
//   alpha  not referenced by this function
//   a      matrix A; supplies the diagonal offset, k (its width) and the
//          row, column and panel strides
//   b      matrix B; supplies the row, column and panel strides
//   beta   scalar applied to C by the kernel
//   c      matrix C; supplies m, n and the execution datatype
//   cntl   control tree; not referenced by this function
//
void bl2_trmm_u_ker_var2( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  b,
                          obj_t*  beta,
                          obj_t*  c,
                          trmm_t* cntl )
{
    // The execution datatype of C selects the kernel and the beta buffer.
    num_t  exec_dt    = bl2_obj_execution_datatype( *c );

    doff_t diag_off_a = bl2_obj_diag_offset( *a );

    // m and n come from C, k from the width of A.
    dim_t  m          = bl2_obj_length( *c );
    dim_t  n          = bl2_obj_width( *c );
    dim_t  k          = bl2_obj_width( *a );

    void*  a_buf      = bl2_obj_buffer_at_off( *a );
    inc_t  rs_a       = bl2_obj_row_stride( *a );
    inc_t  cs_a       = bl2_obj_col_stride( *a );
    inc_t  ps_a       = bl2_obj_panel_stride( *a );

    void*  b_buf      = bl2_obj_buffer_at_off( *b );
    inc_t  rs_b       = bl2_obj_row_stride( *b );
    inc_t  cs_b       = bl2_obj_col_stride( *b );
    inc_t  ps_b       = bl2_obj_panel_stride( *b );

    void*  c_buf      = bl2_obj_buffer_at_off( *c );
    inc_t  rs_c       = bl2_obj_row_stride( *c );
    inc_t  cs_c       = bl2_obj_col_stride( *c );

    num_t  dt_beta;
    void*  buf_beta;

    /*
    // Disabled special case for complex c and a combined with a real b.
    // This is the ONLY combination for which the inner kernel permits the
    // datatypes of a and b to differ. The execution datatype is then real,
    // which calls for doubling
    //   - the m dimension,
    //   - the column stride of c,
    //   - the column stride (ie: the panel length) of a, and
    //   - the panel stride of a.
    if ( bl2_obj_is_complex( *a ) && bl2_obj_is_real( *b ) )
    {
        m    *= 2;
        cs_c *= 2;
        cs_a *= 2;
        ps_a *= 2;
    }
    */

    // Resolve beta: for a scalar constant the address of the value matching
    // exec_dt is used; otherwise the datatype stored in the beta object and
    // the buffer at its offset are used. (A complex beta with zero imaginary
    // part would be reflected in dt_beta, but that is not exploited here.)
    bl2_set_scalar_dt_buffer( beta, exec_dt, dt_beta, buf_beta );

    // Look up the kernel for the execution datatype and call it right away.
    ftypes[exec_dt]( diag_off_a,
                     m,
                     n,
                     k,
                     a_buf, rs_a, cs_a, ps_a,
                     b_buf, rs_b, cs_b, ps_b,
                     buf_beta,
                     c_buf, rs_c, cs_c );
}
/*
   GENTFUNC (trmm_u_ker_var2): type-generic macro-kernel body.

   For each datatype character ch this expands to PASTEMAC(ch,varname),
   which walks C in MR x NR tiles: the outer loop steps over NR-wide
   column panels of the packed B (advancing b1 by ps_b and c1 by NR*cs_c),
   the inner loop steps over MR-tall row panels of the packed A.

   Per column panel, B is first copied into the local buffer bd by the
   _dupl helper. Per row panel of A:
     - if the panel intersects the diagonal, only the stored part of the
       panel (length k_a1011 = k - off_a1011) is multiplied, starting at
       the matching offset inside bd, and the micro-kernel is called with
       the caller's beta;
     - if the panel is strictly above the diagonal, the full length k is
       multiplied and the micro-kernel is called with beta == one;
     - otherwise nothing is done for that panel.

   Edge tiles (m_left rows and/or n_left columns) are routed through the
   MR x NR temporary ct: the live part of C is copied into ct, the
   micro-kernel runs on ct, and the live part is copied back.

   Parameters:
     diagoffa     diagonal offset of A.
     m, n, k      C is m x n; k is the inner dimension.
     a            packed A. rs_a, cs_a and ps_a are accepted but not
                  referenced in this body (see the assumptions below).
     b            packed B. rs_b is used to skip leading rows of B when
                  diagoffa > 0, ps_b to step between column panels;
                  cs_b is not referenced.
     beta         scalar handed to the micro-kernel for diagonal panels.
     c            output matrix with general strides rs_c, cs_c.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
PASTEMAC2(ch,varname,_nr) * \
PASTEMAC2(ch,varname,_ndup) ]; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
PASTEMAC2(ch,varname,_nr) ]; \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
\
/* Alias some constants to shorter names. */ \
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict a1; \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict bd_i; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t k_ndup; \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == GEMM_MR
ps_a == stride to next row panel of A
rs_b == GEMM_NR
cs_b == 1
ps_b == stride to next column panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bl2_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bl2_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* If the diagonal offset is positive, the leading diagoffa columns of A
are implicitly zero, so the leading diagoffa rows of B contribute
nothing. Skip those rows of B, shrink the inner dimension k to match,
and treat this case as if the diagonal offset were zero. (It is k,
not n, that must shrink: the pointer adjustment below moves down the
rows of B, which are indexed by k.) Note that we don't need to adjust
the pointer to A since packm would have simply skipped over the
regions that were not stored. */ \
if ( diagoffa > 0 ) \
{ \
j = diagoffa; \
k = k - j; \
diagoffa = 0; \
b_cast = b_cast + (j )*rs_b; \
} \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Length of the B panel that must be duplicated into bd. */ \
off_a1011 = bl2_max( diagoffa, 0 ); \
k_a1011 = k - off_a1011; \
k_ndup = k_a1011 * NDUP; \
\
/* Step between full-length (k) row panels of A. */ \
rstep_a = k * MR; \
\
cstep_b = ps_b; \
\
rstep_c = MR * rs_c; \
cstep_c = NR * cs_c; \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the full NR-wide column panels. */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
a1 = a_cast; \
c11 = c1; \
\
/* Copy the current iteration's NR columns of B to a local buffer
with each value duplicated. */ \
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
\
/* Interior loop. */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = bl2_max( diagoffa_i, 0 ); \
k_a1011 = k - off_a1011; \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
one, \
a1, \
bd_i, \
beta, \
c11, rs_c, cs_c ); \
\
/* Diagonal panels are stored with their shortened length. */ \
a1 += k_a1011 * MR; \
} \
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k, \
one, \
a1, \
bd, \
one, \
c11, rs_c, cs_c ); \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
\
/* Bottom edge handling. */ \
if ( m_left ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
if ( bl2_intersects_diag_n( diagoffa_i, m_left, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = bl2_max( diagoffa_i, 0 ); \
k_a1011 = k - off_a1011; \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
one, \
a1, \
bd_i, \
beta, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, m_left, k ) ) \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k, \
one, \
a1, \
bd, \
one, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, NR, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/* Leftover (narrower than NR) column panel. */ \
if ( n_left ) \
{ \
a1 = a_cast; \
c11 = c1; \
\
/* Copy the n_left (+ padding) columns of B to a local buffer
with each value duplicated. */ \
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
\
/* Right edge loop. */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = bl2_max( diagoffa_i, 0 ); \
k_a1011 = k - off_a1011; \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
one, \
a1, \
bd_i, \
beta, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the right edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
\
a1 += k_a1011 * MR; \
} \
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k, \
one, \
a1, \
bd, \
one, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the right edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( MR, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
\
/* Bottom-right corner handling. */ \
if ( m_left ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
if ( bl2_intersects_diag_n( diagoffa_i, m_left, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = bl2_max( diagoffa_i, 0 ); \
k_a1011 = k - off_a1011; \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k_a1011, \
one, \
a1, \
bd_i, \
beta, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom-right corner of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, m_left, k ) ) \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_ukr)( k, \
one, \
a1, \
bd, \
one, \
ct, rs_ct, cs_ct ); \
\
/* Copy the result to the bottom-right corner of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_left, n_left, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC( trmm, trmm_u_ker_var2 )

View File

@@ -1,363 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
/* FUNCPTR_T is the file-local name for the pointer type of the
   type-specific trsm_l_ker_var2 kernels; the argument list matches the
   functions generated by GENTFUNC further down in this file. */
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
doff_t diagoffa,
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
void* c, inc_t rs_c, inc_t cs_c
);
/* Table of type-specific kernels; the object-based wrapper below indexes
   it with the execution datatype. */
static FUNCPTR_T GENARRAY(ftypes,trsm_l_ker_var2);
/*
   Object-based front end for the trsm_l_ker_var2 macro-kernel.

   Unpacks the diagonal offset, dimensions, buffers and strides from the
   a, b and c objects, selects the type-specific kernel from ftypes using
   the execution datatype of c, and calls it.

   alpha, beta and cntl are part of the interface but are not referenced
   by this variant.
*/
void bl2_trsm_l_ker_var2( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  b,
                          obj_t*  beta,
                          obj_t*  c,
                          trsm_t* cntl )
{
	/* The execution datatype of c decides which kernel instance runs. */
	const num_t  exec_dt   = bl2_obj_execution_datatype( *c );

	/* Diagonal offset of a, and the problem shape: c is m x n, k is the
	   width of a. */
	const doff_t a_diagoff = bl2_obj_diag_offset( *a );
	const dim_t  m_dim     = bl2_obj_length( *c );
	const dim_t  n_dim     = bl2_obj_width( *c );
	const dim_t  k_dim     = bl2_obj_width( *a );

	/* Buffer, row/column strides and panel stride of a. */
	void* const  a_buf     = bl2_obj_buffer_at_off( *a );
	const inc_t  a_rs      = bl2_obj_row_stride( *a );
	const inc_t  a_cs      = bl2_obj_col_stride( *a );
	const inc_t  a_ps      = bl2_obj_panel_stride( *a );

	/* Buffer, row/column strides and panel stride of b. */
	void* const  b_buf     = bl2_obj_buffer_at_off( *b );
	const inc_t  b_rs      = bl2_obj_row_stride( *b );
	const inc_t  b_cs      = bl2_obj_col_stride( *b );
	const inc_t  b_ps      = bl2_obj_panel_stride( *b );

	/* Buffer and row/column strides of c. */
	void* const  c_buf     = bl2_obj_buffer_at_off( *c );
	const inc_t  c_rs      = bl2_obj_row_stride( *c );
	const inc_t  c_cs      = bl2_obj_col_stride( *c );

	/* Note: a special case for complex a/c combined with real b (which
	   would have doubled m, cs_c, cs_a and ps_a so that the kernel could
	   execute in the real domain) is intentionally not performed here. */

	/* Look up the kernel for this datatype and run it. */
	FUNCPTR_T kernel = ftypes[exec_dt];

	kernel( a_diagoff,
	        m_dim,
	        n_dim,
	        k_dim,
	        a_buf, a_rs, a_cs, a_ps,
	        b_buf, b_rs, b_cs, b_ps,
	        c_buf, c_rs, c_cs );
}
/*
   GENTFUNC (trsm_l_ker_var2): type-generic macro-kernel body.

   For each datatype character ch this expands to PASTEMAC(ch,varname).
   The outer loop steps over column panels of the packed B (advancing b1
   by ps_b and c1 by NR*cs_c); each panel is first copied into the local
   buffer bd by the _dupl helper. The inner loop steps over MR-tall row
   panels of the packed A:
     - a panel that intersects the diagonal is handled by the fused
       gemm/trsm micro-kernel (_trsm_ukr), which receives the A10/A11 and
       Bd01/Bd11 sub-blocks plus the matching block b11 of B;
     - a panel strictly below the diagonal is handled by the plain gemm
       micro-kernel (_gemm_ukr) with alpha == minus_one;
     - any other panel is skipped.
   Tiles that are not a full MR x NR are computed into the temporary ct
   and then copied (trsm case) or added (gemm case) into C.

   rs_a, cs_a, ps_a, rs_b and cs_b are accepted but not referenced in
   this body (see the assumptions listed below).
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
PASTEMAC2(ch,varname,_nr) * \
PASTEMAC2(ch,varname,_ndup) ]; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
PASTEMAC2(ch,varname,_nr) ]; \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
\
/* Alias constants to shorter names. */ \
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict a1; \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict bd01; \
ctype* restrict bd11; \
ctype* restrict bd_i; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_ndup; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a1011, off_b11; \
dim_t i, j; \
dim_t rstep_a; \
dim_t rstep_b, cstep_b; \
dim_t rstep_c, cstep_c; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == GEMM_MR
ps_a == stride to next row panel of A
rs_b == GEMM_NR
cs_b == 1
ps_b == stride to next column panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bl2_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bl2_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* The first thing we do is check the k dimension, which needs to be
a multiple of MR. If k isn't a multiple of MR, we adjust it higher.
This allows us to use a single micro-kernel, which performs an
MR x MR triangular solve, even for cases when k isn't actually a
multiple of MR. The key is that when A was packed, its edges were
first zero padded, and further, the panel that stores the bottom-
right corner of the matrix has its diagonal that extends into
the zero padded region as identity. This allows the trsm of that
bottom-right panel to proceed without producing any infs or NaNs
or any other numerical funny business that would infect the "good"
values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* If the diagonal offset is negative, adjust the pointer to C and
treat this case as if the diagonal offset were zero. Note that
we don't need to adjust the pointer to A since packm would have
simply skipped over the panels that were not stored. */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Edge tiles are handled inside the main loops, so count them as
iterations. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Length of the B panel that must be duplicated into bd. */ \
k_a1011 = bl2_min( k, diagoffa + m ); \
k_ndup = k_a1011 * NDUP; \
\
/* Row offset of the first block b11 within each packed panel of B. */ \
off_b11 = diagoffa; \
\
rstep_a = k * MR; \
\
rstep_b = NR * MR; \
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
a1 = a_cast; \
c11 = c1; \
b11 = b1 + (off_b11 )*NR; \
\
n_cur = ( bl2_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Copy the current iteration's NR columns of B to a local buffer
with each value duplicated. */ \
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bl2_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = 0; \
k_a1011 = bl2_min( k, diagoffa_i + m_cur ); \
k_a10 = k_a1011 - m_cur; \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Compute the addresses of the A10 panel and triangular
block A11, and the corresponding panel Bd01 and block
Bd11. */ \
a10 = a1; \
a11 = a1 + k_a10 * MR; \
bd01 = bd_i; \
bd11 = bd_i + k_a10 * NR * NDUP; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
PASTEMAC2(ch,varname,_trsm_ukr)( k_a10, \
a10, \
a11, \
bd01, \
bd11, \
b11, \
c11, rs_c, cs_c ); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel, writing the
full MR x NR result into the temporary ct. */ \
PASTEMAC2(ch,varname,_trsm_ukr)( k_a10, \
a10, \
a11, \
bd01, \
bd11, \
b11, \
ct, rs_ct, cs_ct ); \
\
/* Copy only the live m_cur x n_cur part to the edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += k_a1011 * MR; \
} \
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_gemm_ukr)( k, \
minus_one, \
a1, \
bd, \
one, \
c11, rs_c, cs_c ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with beta == zero so that ct
holds only the product. */ \
PASTEMAC2(ch,varname,_gemm_ukr)( k, \
minus_one, \
a1, \
bd, \
zero, \
ct, rs_ct, cs_ct ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
b11 += rstep_b; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC( trsm, trsm_l_ker_var2 )

View File

@@ -1,367 +0,0 @@
/*
libblis
An object-based infrastructure for developing high-performance
dense linear algebra libraries.
Copyright (C) 2013, The University of Texas
libblis is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
libblis is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with libblis; if you did not receive a copy, see
http://www.gnu.org/licenses/.
For more information, please contact us at blis@cs.utexas.edu or
send mail to:
Field G. Van Zee and/or
Robert A. van de Geijn
The University of Texas at Austin
Institute for Computational Engineering and Science
1 University Station D9500
Austin TX 78712
*/
#include "blis2.h"
/* FUNCPTR_T is the file-local name for the pointer type of the
   type-specific trsm_l_ker_var2 kernels; the argument list matches the
   functions generated by GENTFUNC further down in this file. */
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
doff_t diagoffa,
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
void* c, inc_t rs_c, inc_t cs_c
);
/* Table of type-specific kernels; the object-based wrapper below indexes
   it with the execution datatype. */
static FUNCPTR_T GENARRAY(ftypes,trsm_l_ker_var2);
/*
   Object-based front end for the trsm_l_ker_var2 macro-kernel.

   Unpacks the diagonal offset, dimensions, buffers and strides from the
   a, b and c objects, selects the type-specific kernel from ftypes using
   the execution datatype of c, and calls it.

   alpha, beta and cntl are part of the interface but are not referenced
   by this variant.
*/
void bl2_trsm_l_ker_var2( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  b,
                          obj_t*  beta,
                          obj_t*  c,
                          trsm_t* cntl )
{
	/* The execution datatype of c decides which kernel instance runs. */
	const num_t  exec_dt   = bl2_obj_execution_datatype( *c );

	/* Diagonal offset of a, and the problem shape: c is m x n, k is the
	   width of a. */
	const doff_t a_diagoff = bl2_obj_diag_offset( *a );
	const dim_t  m_dim     = bl2_obj_length( *c );
	const dim_t  n_dim     = bl2_obj_width( *c );
	const dim_t  k_dim     = bl2_obj_width( *a );

	/* Buffer, row/column strides and panel stride of a. */
	void* const  a_buf     = bl2_obj_buffer_at_off( *a );
	const inc_t  a_rs      = bl2_obj_row_stride( *a );
	const inc_t  a_cs      = bl2_obj_col_stride( *a );
	const inc_t  a_ps      = bl2_obj_panel_stride( *a );

	/* Buffer, row/column strides and panel stride of b. */
	void* const  b_buf     = bl2_obj_buffer_at_off( *b );
	const inc_t  b_rs      = bl2_obj_row_stride( *b );
	const inc_t  b_cs      = bl2_obj_col_stride( *b );
	const inc_t  b_ps      = bl2_obj_panel_stride( *b );

	/* Buffer and row/column strides of c. */
	void* const  c_buf     = bl2_obj_buffer_at_off( *c );
	const inc_t  c_rs      = bl2_obj_row_stride( *c );
	const inc_t  c_cs      = bl2_obj_col_stride( *c );

	/* Note: a special case for complex a/c combined with real b (which
	   would have doubled m, cs_c, cs_a and ps_a so that the kernel could
	   execute in the real domain) is intentionally not performed here. */

	/* Look up the kernel for this datatype and run it. */
	FUNCPTR_T kernel = ftypes[exec_dt];

	kernel( a_diagoff,
	        m_dim,
	        n_dim,
	        k_dim,
	        a_buf, a_rs, a_cs, a_ps,
	        b_buf, b_rs, b_cs, b_ps,
	        c_buf, c_rs, c_cs );
}
/*
   GENTFUNC (trsm_l_ker_var2, split gemm + trsm-only interior path):
   type-generic macro-kernel body.

   For each datatype character ch this expands to PASTEMAC(ch,varname).
   The outer loop steps over column panels of the packed B (advancing b1
   by ps_b and c1 by NR*cs_c); each panel is first copied into the local
   buffer bd by the _dupl helper. The inner loop steps over MR-tall row
   panels of the packed A:
     - a panel that intersects the diagonal is handled, for full MR x NR
       tiles, by a gemm micro-kernel call that updates b11 in place
       followed by a _trsmonly_ukr call; edge tiles instead use the fused
       _trsm_ukr micro-kernel via the temporary ct;
     - a panel strictly below the diagonal is handled by the plain gemm
       micro-kernel (_gemm_ukr) with alpha == minus_one;
     - any other panel is skipped.

   rs_a, cs_a and ps_a are accepted but not referenced in this body (see
   the assumptions listed below).
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
PASTEMAC2(ch,varname,_nr) * \
PASTEMAC2(ch,varname,_ndup) ]; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
PASTEMAC2(ch,varname,_nr) ]; \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
\
/* Alias constants to shorter names. */ \
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict a1; \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict bd01; \
ctype* restrict bd11; \
ctype* restrict bd_i; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_ndup; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a1011, off_b11; \
dim_t i, j; \
dim_t rstep_a; \
dim_t rstep_b, cstep_b; \
dim_t rstep_c, cstep_c; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == GEMM_MR
ps_a == stride to next row panel of A
rs_b == GEMM_NR
cs_b == 1
ps_b == stride to next column panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bl2_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bl2_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* The first thing we do is check the k dimension, which needs to be
a multiple of MR. If k isn't a multiple of MR, we adjust it higher.
This allows us to use a single micro-kernel, which performs an
MR x MR triangular solve, even for cases when k isn't actually a
multiple of MR. The key is that when A was packed, its edges were
first zero padded, and further, the panel that stores the bottom-
right corner of the matrix has its diagonal that extends into
the zero padded region as identity. This allows the trsm of that
bottom-right panel to proceed without producing any infs or NaNs
or any other numerical funny business that would infect the "good"
values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* If the diagonal offset is negative, adjust the pointer to C and
treat this case as if the diagonal offset were zero. Note that
we don't need to adjust the pointer to A since packm would have
simply skipped over the panels that were not stored. */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Edge tiles are handled inside the main loops, so count them as
iterations. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Length of the B panel that must be duplicated into bd. */ \
k_a1011 = bl2_min( k, diagoffa + m ); \
k_ndup = k_a1011 * NDUP; \
\
/* Row offset of the first block b11 within each packed panel of B. */ \
off_b11 = diagoffa; \
\
rstep_a = k * MR; \
\
rstep_b = NR * MR; \
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
a1 = a_cast; \
c11 = c1; \
b11 = b1 + (off_b11 )*NR; \
\
n_cur = ( bl2_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Copy the current iteration's NR columns of B to a local buffer
with each value duplicated. */ \
PASTEMAC2(ch,varname,_dupl)( k_ndup, b1, bd ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bl2_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, perform a
gemm update followed by a trsm (as two micro-kernel calls for
interior tiles, or one fused call for edge tiles).
If the current panel of A resides below the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bd. Then compute the length of that panel. */ \
off_a1011 = 0; \
k_a1011 = bl2_min( k, diagoffa_i + m_cur ); \
k_a10 = k_a1011 - m_cur; \
\
bd_i = bd + off_a1011 * NR * NDUP; \
\
/* Compute the addresses of the A10 panel and triangular
block A11, and the corresponding panel Bd01 and block
Bd11. */ \
a10 = a1; \
a11 = a1 + k_a10 * MR; \
bd01 = bd_i; \
bd11 = bd_i + k_a10 * NR * NDUP; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Unfused path: first apply the gemm update
b11 := b11 - a10 * bd01 directly to the packed block of B
(using B's own strides), then invoke the trsm-only
micro-kernel on a11.
NOTE(review): k_a10 is also passed as the first argument
of _trsmonly_ukr; confirm against that kernel's
signature. */ \
PASTEMAC2(ch,varname,_gemm_ukr)( k_a10, \
minus_one, \
a10, \
bd01, \
one, \
b11, rs_b, cs_b ); \
PASTEMAC2(ch,varname,_trsmonly_ukr)( k_a10, \
a11, \
bd11, \
b11, \
c11, rs_c, cs_c ); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel, writing the
full MR x NR result into the temporary ct. */ \
PASTEMAC2(ch,varname,_trsm_ukr)( k_a10, \
a10, \
a11, \
bd01, \
bd11, \
b11, \
ct, rs_ct, cs_ct ); \
\
/* Copy only the live m_cur x n_cur part to the edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += k_a1011 * MR; \
} \
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
PASTEMAC2(ch,varname,_gemm_ukr)( k, \
minus_one, \
a1, \
bd, \
one, \
c11, rs_c, cs_c ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with beta == zero so that ct
holds only the product. */ \
PASTEMAC2(ch,varname,_gemm_ukr)( k, \
minus_one, \
a1, \
bd, \
zero, \
ct, rs_ct, cs_ct ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
b11 += rstep_b; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC( trsm, trsm_l_ker_var2 )

View File

@@ -1,386 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
/* FUNCPTR_T is the file-local name for the pointer type of the
   type-specific trsm_l_ker_var3 kernels; the argument list matches the
   functions generated by GENTFUNC further down in this file. */
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
doff_t diagoffa,
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
void* c, inc_t rs_c, inc_t cs_c
);
/* Table of type-specific kernels; the object-based wrapper below indexes
   it with the execution datatype. */
static FUNCPTR_T GENARRAY(ftypes,trsm_l_ker_var3);
/*
   Object-based front end for the trsm_l_ker_var3 macro-kernel.

   Unpacks the diagonal offset, dimensions, buffers and strides from the
   a, b and c objects, selects the type-specific kernel from ftypes using
   the execution datatype of c, and calls it.

   alpha, beta and cntl are part of the interface but are not referenced
   by this variant.
*/
void bl2_trsm_l_ker_var3( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  b,
                          obj_t*  beta,
                          obj_t*  c,
                          trsm_t* cntl )
{
	/* The execution datatype of c decides which kernel instance runs. */
	const num_t  exec_dt   = bl2_obj_execution_datatype( *c );

	/* Diagonal offset of a, and the problem shape: c is m x n, k is the
	   width of a. */
	const doff_t a_diagoff = bl2_obj_diag_offset( *a );
	const dim_t  m_dim     = bl2_obj_length( *c );
	const dim_t  n_dim     = bl2_obj_width( *c );
	const dim_t  k_dim     = bl2_obj_width( *a );

	/* Buffer, row/column strides and panel stride of a. */
	void* const  a_buf     = bl2_obj_buffer_at_off( *a );
	const inc_t  a_rs      = bl2_obj_row_stride( *a );
	const inc_t  a_cs      = bl2_obj_col_stride( *a );
	const inc_t  a_ps      = bl2_obj_panel_stride( *a );

	/* Buffer, row/column strides and panel stride of b. */
	void* const  b_buf     = bl2_obj_buffer_at_off( *b );
	const inc_t  b_rs      = bl2_obj_row_stride( *b );
	const inc_t  b_cs      = bl2_obj_col_stride( *b );
	const inc_t  b_ps      = bl2_obj_panel_stride( *b );

	/* Buffer and row/column strides of c. */
	void* const  c_buf     = bl2_obj_buffer_at_off( *c );
	const inc_t  c_rs      = bl2_obj_row_stride( *c );
	const inc_t  c_cs      = bl2_obj_col_stride( *c );

	/* Note: a special case for complex a/c combined with real b (which
	   would have doubled m, cs_c, cs_a and ps_a so that the kernel could
	   execute in the real domain) is intentionally not performed here. */

	/* Look up the kernel for this datatype and run it. */
	FUNCPTR_T kernel = ftypes[exec_dt];

	kernel( a_diagoff,
	        m_dim,
	        n_dim,
	        k_dim,
	        a_buf, a_rs, a_cs, a_ps,
	        b_buf, b_rs, b_cs, b_ps,
	        c_buf, c_rs, c_cs );
}
//
// Typed macro-kernel template for trsm with a lower-triangular A (variant 3).
//
// For each datatype, the generated function walks the n dimension NR columns
// at a time and, inside that, the m dimension MR rows at a time (top to
// bottom). For every MR-row panel of A it does one of:
//  - panel intersects the diagonal: call the gemm micro-kernel to update the
//    MR x NR block b11 with the part of the panel to the left of the
//    triangular block, then call the trsm micro-kernel on b11, which writes
//    its result to C;
//  - panel lies strictly below the diagonal: call the gemm micro-kernel to
//    update the MR x NR block of C;
//  - otherwise: nothing is done.
// For partial (edge) tiles the result is produced in the local MR x NR
// buffer ct, and only its m_cur x n_cur part is copied or added into C.
//
// The arguments rs_a, cs_a and ps_a are accepted but not referenced in the
// body; the packed layout listed under "Assumptions/assertions" is relied on
// instead.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, trsmukr, gemmukr ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
PASTEMAC2(ch,varname,_nr) * \
PASTEMAC2(ch,varname,_ndup) ]; \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases (stored with unit row stride and a
column stride of MR). */ \
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
PASTEMAC2(ch,varname,_nr) ]; \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
\
/* Alias constants to shorter names. */ \
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
const bool_t DUPB = PASTEMAC2(ch,varname,_dupb); \
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict a1; \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict bp01; \
ctype* restrict bp11; \
ctype* restrict bp_i; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_nr; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a1011, off_b11; \
dim_t i, j; \
dim_t rstep_a; \
dim_t rstep_b, cstep_b; \
dim_t rstep_c, cstep_c; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == GEMM_MR
ps_a == stride to next row panel of A
rs_b == GEMM_NR
cs_b == 1
ps_b == stride to next column panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bl2_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bl2_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* The first thing we do is check the k dimension, which needs to be
a multiple of MR. If k isn't a multiple of MR, we adjust it higher.
This allows us to use a single micro-kernel, which performs an
MR x MR triangular solve, even for cases when k isn't actually a
multiple of MR. The key is that when A was packed, its edges were
first zero padded, and further, the panel that stores the bottom-
right corner of the matrix has the part of its diagonal that extends
into the zero padded region set to identity. This allows the trsm of
that bottom-right panel to proceed without producing any infs or NaNs
or any other numerical funny business that would infect the "good"
values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* If the diagonal offset is negative, adjust the pointer to C and
treat this case as if the diagonal offset were zero. Note that
we don't need to adjust the pointer to A since packm would have
simply skipped over the panels that were not stored. */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. A non-zero remainder adds one extra (edge) iteration. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_a1011 = bl2_min( k, diagoffa + m ); \
k_nr = k_a1011 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * MR; \
\
rstep_b = NR * MR; \
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
off_b11 = diagoffa; \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
a1 = a_cast; \
c11 = c1; \
b11 = b1 + (off_b11 )*NR; \
\
n_cur = ( bl2_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bd ); \
else bp = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bl2_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use the gemm
micro-kernel followed by the trsm micro-kernel.
If the current panel of A resides below the diagonal, use a
regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bp. Then compute the length of that panel. */ \
off_a1011 = 0; \
k_a1011 = bl2_min( k, diagoffa_i + MR ); \
k_a10 = k_a1011 - MR; \
\
bp_i = bp + off_a1011 * NR * NDUP; \
\
/* Compute the addresses of the A10 panel and triangular
block A11, and the corresponding panel Bd01 and block
Bd11. */ \
a10 = a1; \
a11 = a1 + k_a10 * MR; \
bp01 = bp_i; \
bp11 = bp_i + k_a10 * NR * NDUP; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm and trsm micro-kernels. The gemm call updates
b11 in place; the trsm call then writes its result to C. */ \
PASTEMAC(ch,gemmukr)( k_a10, \
minus_one, \
a10, \
bp01, \
one, \
b11, rs_b, cs_b ); \
PASTEMAC(ch,trsmukr)( a11, \
b11, \
bp11, \
c11, rs_c, cs_c ); \
} \
else \
{ \
/* Invoke the gemm and trsm micro-kernels, directing the trsm
output to the temporary buffer ct instead of C. */ \
PASTEMAC(ch,gemmukr)( k_a10, \
minus_one, \
a10, \
bp01, \
one, \
b11, rs_b, cs_b ); \
PASTEMAC(ch,trsmukr)( a11, \
b11, \
bp11, \
ct, rs_ct, cs_ct ); \
\
/* Copy the m_cur x n_cur result to the edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
/* Step past the k_a1011 packed columns (MR elements each) that
make up this panel. */ \
a1 += k_a1011 * MR; \
} \
else if ( bl2_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a1, \
bp, \
one, \
c11, rs_c, cs_c ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel, overwriting ct (beta is zero). */ \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
b11 += rstep_b; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC2( trsm_l_ker_var3, TRSM_L_UKERNEL, GEMM_UKERNEL )

View File

@@ -1,96 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define macro-kernel blocksizes.
//
// One group of five constants per datatype (s, d, c, z), each forwarding to
// a BLIS_DEFAULT_* configuration value. The macro-kernel uses them as:
//   _dupb  whether elements of B are duplicated into a local buffer
//   _ndup  duplication factor for the elements of B
//   _kc    with _nr and _ndup, sizes the local duplication buffer
//   _mr    number of rows in a micro-kernel tile
//   _nr    number of columns in a micro-kernel tile
//
// NOTE: These MR and NR values below MUST match the values that packm uses
// when initializing its control tree node.
//
#define bl2_strsm_l_ker_var3_dupb BLIS_DEFAULT_DUPLICATE_B
#define bl2_strsm_l_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_S
#define bl2_strsm_l_ker_var3_kc BLIS_DEFAULT_KC_S
#define bl2_strsm_l_ker_var3_mr BLIS_DEFAULT_MR_S
#define bl2_strsm_l_ker_var3_nr BLIS_DEFAULT_NR_S
#define bl2_dtrsm_l_ker_var3_dupb BLIS_DEFAULT_DUPLICATE_B
#define bl2_dtrsm_l_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_D
#define bl2_dtrsm_l_ker_var3_kc BLIS_DEFAULT_KC_D
#define bl2_dtrsm_l_ker_var3_mr BLIS_DEFAULT_MR_D
#define bl2_dtrsm_l_ker_var3_nr BLIS_DEFAULT_NR_D
#define bl2_ctrsm_l_ker_var3_dupb BLIS_DEFAULT_DUPLICATE_B
#define bl2_ctrsm_l_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_C
#define bl2_ctrsm_l_ker_var3_kc BLIS_DEFAULT_KC_C
#define bl2_ctrsm_l_ker_var3_mr BLIS_DEFAULT_MR_C
#define bl2_ctrsm_l_ker_var3_nr BLIS_DEFAULT_NR_C
#define bl2_ztrsm_l_ker_var3_dupb BLIS_DEFAULT_DUPLICATE_B
#define bl2_ztrsm_l_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_Z
#define bl2_ztrsm_l_ker_var3_kc BLIS_DEFAULT_KC_Z
#define bl2_ztrsm_l_ker_var3_mr BLIS_DEFAULT_MR_Z
#define bl2_ztrsm_l_ker_var3_nr BLIS_DEFAULT_NR_Z
//
// Prototype object-based interface.
//
// Entry point that takes its operands as obj_t objects. The definition reads
// the buffers and strides of a, b and c and forwards them to a typed
// macro-kernel selected by datatype.
// NOTE(review): how alpha, beta and cntl are used cannot be told from this
// header -- check the definition before relying on them.
//
void bl2_trsm_l_ker_var3( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
trsm_t* cntl );
//
// Prototype BLAS-like interfaces.
//
// GENTPROT declares one typed macro-kernel; the name is formed from the
// datatype character ch and varname via PASTEMAC. The operands a, b and c
// are passed as untyped buffers together with their strides (row, column
// and, for a and b, panel stride). INSERT_GENTPROT_BASIC presumably
// instantiates the prototype once per basic datatype -- its definition is
// not visible here.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_l_ker_var3 )

View File

@@ -1,388 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Function-pointer type for the typed macro-kernels generated further down in
// this file: diagonal offset of a, the m/n/k dimensions, then the buffer and
// strides of a, b and c.
// NOTE(review): FUNCPTR_T expands to gemm_fp, so the typedef below really
// declares a type named gemm_fp even though this file implements trsm. The
// name looks inherited from a gemm macro-kernel; confirm it cannot clash with
// another declaration of gemm_fp.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)(
doff_t diagoffa,
dim_t m,
dim_t n,
dim_t k,
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
void* c, inc_t rs_c, inc_t cs_c
);
// Table of typed implementations; bl2_trsm_u_ker_var3() indexes it with the
// execution datatype. Presumably GENARRAY fills in the per-datatype variants
// of trsm_u_ker_var3 -- its definition is not visible here.
static FUNCPTR_T GENARRAY(ftypes,trsm_u_ker_var3);
//
// Object-based front end for the upper-triangular trsm macro-kernel
// (variant 3).
//
// Pulls the diagonal offset, dimensions, buffers and strides out of the
// a, b and c objects and hands them to the typed macro-kernel selected by
// the execution datatype of c. The alpha, beta and cntl arguments are not
// referenced here.
//
void bl2_trsm_u_ker_var3( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  b,
                          obj_t*  beta,
                          obj_t*  c,
                          trsm_t* cntl )
{
	// Datatype used to pick the typed implementation.
	const num_t  dt_exec  = bl2_obj_execution_datatype( *c );

	// Diagonal offset of a, and the problem shape: c is m x n, k is the
	// width of a.
	const doff_t diagoffa = bl2_obj_diag_offset( *a );
	const dim_t  m        = bl2_obj_length( *c );
	const dim_t  n        = bl2_obj_width( *c );
	const dim_t  k        = bl2_obj_width( *a );

	// Matrix a: buffer plus row, column and panel strides.
	void* const  buf_a    = bl2_obj_buffer_at_off( *a );
	const inc_t  rs_a     = bl2_obj_row_stride( *a );
	const inc_t  cs_a     = bl2_obj_col_stride( *a );
	const inc_t  ps_a     = bl2_obj_panel_stride( *a );

	// Matrix b: buffer plus row, column and panel strides.
	void* const  buf_b    = bl2_obj_buffer_at_off( *b );
	const inc_t  rs_b     = bl2_obj_row_stride( *b );
	const inc_t  cs_b     = bl2_obj_col_stride( *b );
	const inc_t  ps_b     = bl2_obj_panel_stride( *b );

	// Matrix c: buffer plus row and column strides.
	void* const  buf_c    = bl2_obj_buffer_at_off( *c );
	const inc_t  rs_c     = bl2_obj_row_stride( *c );
	const inc_t  cs_c     = bl2_obj_col_stride( *c );

	// NOTE: A special case for complex a and c with real b (doubling m,
	// cs_c, cs_a and ps_a so that the kernel runs on real data) was once
	// sketched at this point but is not enabled; no such adjustment is made.

	// Look up the typed macro-kernel for dt_exec and call it.
	ftypes[ dt_exec ]( diagoffa,
	                   m,
	                   n,
	                   k,
	                   buf_a, rs_a, cs_a, ps_a,
	                   buf_b, rs_b, cs_b, ps_b,
	                   buf_c, rs_c, cs_c );
}
//
// Typed macro-kernel template for trsm with an upper-triangular A (variant 3).
//
// For each datatype, the generated function walks the n dimension NR columns
// at a time and, inside that, the m dimension MR rows at a time from the
// bottom row panel up to the top one. For every MR-row panel of A it does
// one of:
//  - panel intersects the diagonal: call the gemm micro-kernel to update the
//    MR x NR block b11 with the part of the panel to the right of the
//    triangular block, then call the trsm micro-kernel on b11, which writes
//    its result to C;
//  - panel lies strictly above the diagonal: call the gemm micro-kernel to
//    update the MR x NR block of C;
//  - otherwise: nothing is done.
// For partial (edge) tiles the result is produced in the local MR x NR
// buffer ct, and only its m_cur x n_cur part is copied or added into C.
//
// The arguments rs_a, cs_a and ps_a are accepted but not referenced in the
// body; the packed layout listed under "Assumptions/assertions" is relied on
// instead.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, trsmukr, gemmukr ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC2(ch,varname,_kc) * \
PASTEMAC2(ch,varname,_nr) * \
PASTEMAC2(ch,varname,_ndup) ]; \
ctype* restrict bp; \
\
/* Temporary C buffer for edge cases (stored with unit row stride and a
column stride of MR). */ \
ctype ct[ PASTEMAC2(ch,varname,_mr) * \
PASTEMAC2(ch,varname,_nr) ]; \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC2(ch,varname,_mr); \
\
/* Alias constants to shorter names. */ \
const dim_t MR = PASTEMAC2(ch,varname,_mr); \
const dim_t NR = PASTEMAC2(ch,varname,_nr); \
const bool_t DUPB = PASTEMAC2(ch,varname,_dupb); \
const dim_t NDUP = PASTEMAC2(ch,varname,_ndup); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict a1; \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict a12; \
ctype* restrict a11; \
ctype* restrict bp21; \
ctype* restrict bp11; \
ctype* restrict bp_i; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_nr; \
dim_t k_a1112; \
dim_t k_a11, k_a12; \
dim_t off_a1112, off_b11; \
dim_t i, j, ib; \
dim_t rstep_a; \
dim_t rstep_b, cstep_b; \
dim_t rstep_c, cstep_c; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == GEMM_MR
ps_a == stride to next row panel of A
rs_b == GEMM_NR
cs_b == 1
ps_b == stride to next column panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bl2_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bl2_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* The first thing we do is check the k dimension, which needs to be
a multiple of MR. If k isn't a multiple of MR, we adjust it higher.
This allows us to use a single micro-kernel, which performs an
MR x MR triangular solve, even for cases when k isn't actually a
multiple of MR. The key is that when A was packed, its edges were
first zero padded, and further, the panel that stores the bottom-
right corner of the matrix has the part of its diagonal that extends
into the zero padded region set to identity. This allows the trsm of
that bottom-right panel to proceed without producing any infs or NaNs
or any other numerical funny business that would infect the "good"
values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* If the diagonal offset is positive, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that
we don't need to adjust the pointer to A since packm would have
simply skipped over the panels that were not stored. */ \
if ( diagoffa > 0 ) \
{ \
j = diagoffa; \
k = k - j; \
diagoffa = 0; \
b_cast = b_cast + (j )*rs_b; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. A non-zero remainder adds one extra (edge) iteration. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Compute the number of elements in B to duplicate per iteration. */ \
k_a1112 = k; \
k_nr = k_a1112 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * MR; \
\
rstep_b = NR * MR; \
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* NOTE(review): off_b11 is assigned here but never read in this variant. */ \
off_b11 = 0; \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* If the micro-kernel needs elements of B duplicated, set bp to
point to the duplication buffer. If no duplication is called for,
bp will be set to the current column panel of B for each iteration
of the outer loop below. */ \
if ( DUPB ) bp = bd; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
/* Start at the last row panel: b11 and c11 point at the bottom block and
are stepped backwards as the inner loop proceeds. */ \
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
b11 = b1 + (m_iter-1)*rstep_b; \
\
n_cur = ( bl2_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* If duplication is needed, copy the current iteration's NR
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bd ); \
else bp = b1; \
\
/* Loop over the m dimension (MR rows at a time). ib counts iterations;
i is the index of the row panel being processed, running from
m_iter-1 down to 0. */ \
for ( ib = 0; ib < m_iter; ++ib ) \
{ \
i = m_iter - 1 - ib; \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bl2_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use the gemm
micro-kernel followed by the trsm micro-kernel.
If the current panel of A resides above the diagonal, use a
regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bl2_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in bp. Then compute the length of that panel.
NOTE(review): the assignment to k_a1112 ends in a stray second
semicolon (an empty statement). */ \
off_a1112 = bl2_max( diagoffa_i, 0 ); \
k_a1112 = k - off_a1112;; \
k_a12 = k_a1112 - MR; \
k_a11 = MR; \
\
bp_i = bp + off_a1112 * NR * NDUP; \
\
/* Compute the addresses of the A12 panel and triangular
block A11, and the corresponding panel Bd21 and block
Bd11. */ \
a11 = a1; \
a12 = a1 + k_a11 * MR; \
bp11 = bp_i; \
bp21 = bp_i + k_a11 * NR * NDUP; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm and trsm micro-kernels. The gemm call updates
b11 in place; the trsm call then writes its result to C. */ \
PASTEMAC(ch,gemmukr)( k_a12, \
minus_one, \
a12, \
bp21, \
one, \
b11, rs_b, cs_b ); \
PASTEMAC(ch,trsmukr)( a11, \
b11, \
bp11, \
c11, rs_c, cs_c ); \
} \
else \
{ \
/* Invoke the gemm and trsm micro-kernels, directing the trsm
output to the temporary buffer ct instead of C. */ \
PASTEMAC(ch,gemmukr)( k_a12, \
minus_one, \
a12, \
bp21, \
one, \
b11, rs_b, cs_b ); \
PASTEMAC(ch,trsmukr)( a11, \
b11, \
bp11, \
ct, rs_ct, cs_ct ); \
\
/* Copy the m_cur x n_cur result to the edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
/* Step past the k_a1112 packed columns (MR elements each) that
make up this panel. */ \
a1 += k_a1112 * MR; \
} \
else if ( bl2_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a1, \
bp, \
one, \
c11, rs_c, cs_c ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel, overwriting ct (beta is zero). */ \
PASTEMAC(ch,gemmukr)( k, \
minus_one, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
b11 -= rstep_b; \
c11 -= rstep_c; \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC2( trsm_u_ker_var3, TRSM_U_UKERNEL, GEMM_UKERNEL )

View File

@@ -1,96 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define macro-kernel blocksizes.
//
// One group of five constants per datatype (s, d, c, z), each forwarding to
// a BLIS_DEFAULT_* configuration value. The macro-kernel uses them as:
//   _dupb  whether elements of B are duplicated into a local buffer
//   _ndup  duplication factor for the elements of B
//   _kc    with _nr and _ndup, sizes the local duplication buffer
//   _mr    number of rows in a micro-kernel tile
//   _nr    number of columns in a micro-kernel tile
//
// NOTE: These MR and NR values below MUST match the values that packm uses
// when initializing its control tree node.
//
#define bl2_strsm_u_ker_var3_dupb BLIS_DEFAULT_DUPLICATE_B
#define bl2_strsm_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_S
#define bl2_strsm_u_ker_var3_kc BLIS_DEFAULT_KC_S
#define bl2_strsm_u_ker_var3_mr BLIS_DEFAULT_MR_S
#define bl2_strsm_u_ker_var3_nr BLIS_DEFAULT_NR_S
#define bl2_dtrsm_u_ker_var3_dupb BLIS_DEFAULT_DUPLICATE_B
#define bl2_dtrsm_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_D
#define bl2_dtrsm_u_ker_var3_kc BLIS_DEFAULT_KC_D
#define bl2_dtrsm_u_ker_var3_mr BLIS_DEFAULT_MR_D
#define bl2_dtrsm_u_ker_var3_nr BLIS_DEFAULT_NR_D
#define bl2_ctrsm_u_ker_var3_dupb BLIS_DEFAULT_DUPLICATE_B
#define bl2_ctrsm_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_C
#define bl2_ctrsm_u_ker_var3_kc BLIS_DEFAULT_KC_C
#define bl2_ctrsm_u_ker_var3_mr BLIS_DEFAULT_MR_C
#define bl2_ctrsm_u_ker_var3_nr BLIS_DEFAULT_NR_C
#define bl2_ztrsm_u_ker_var3_dupb BLIS_DEFAULT_DUPLICATE_B
#define bl2_ztrsm_u_ker_var3_ndup BLIS_DEFAULT_NUM_DUPL_Z
#define bl2_ztrsm_u_ker_var3_kc BLIS_DEFAULT_KC_Z
#define bl2_ztrsm_u_ker_var3_mr BLIS_DEFAULT_MR_Z
#define bl2_ztrsm_u_ker_var3_nr BLIS_DEFAULT_NR_Z
//
// Prototype object-based interface.
//
// Entry point that takes its operands as obj_t objects. The definition reads
// the diagonal offset, dimensions, buffers and strides from a, b and c and
// forwards them to the typed macro-kernel selected by the execution datatype
// of c; it does not reference alpha, beta or cntl.
//
void bl2_trsm_u_ker_var3( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
trsm_t* cntl );
//
// Prototype BLAS-like interfaces.
//
// GENTPROT declares one typed macro-kernel; the name is formed from the
// datatype character ch and varname via PASTEMAC. The operands a, b and c
// are passed as untyped buffers together with their strides (row, column
// and, for a and b, panel stride). INSERT_GENTPROT_BASIC presumably
// instantiates the prototype once per basic datatype -- its definition is
// not visible here.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
dim_t m, \
dim_t n, \
dim_t k, \
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
void* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_u_ker_var3 )

View File

@@ -1,200 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Single-precision real 4x2 lower-triangular solve micro-kernel: placeholder.
// No solve is performed. The arguments are ignored and the only action is to
// pass BLIS_NOT_YET_IMPLEMENTED to bl2_check_error_code(); what happens after
// that (abort or return) is decided by that routine, which is not visible
// here.
void bl2_strsm_l_4x2(
float* restrict a11,
float* restrict b11,
float* restrict bd11,
float* restrict c11, inc_t rs_c, inc_t cs_c
)
{
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
//
// Reference double-precision 4x2 micro-kernel for the lower-triangular solve.
//
//   a11   4x4 lower-triangular block, read with row stride 1 and column
//         stride 4. Only the lower triangle (diagonal included) is read.
//   b11   4x2 right-hand-side block with row stride 2 and column stride 1;
//         overwritten with the solution.
//   bd11  receives the solution in the same row-by-row order as b11, with
//         each value stored twice in a row.
//   c11   receives a copy of the solution, addressed with rs_c and cs_c.
//
// Rows are solved from top to bottom by forward substitution. Each row is
// multiplied (not divided) by the diagonal entry read from a11, so the
// diagonal is presumably stored pre-inverted by the packing routine --
// TODO confirm.
//
void bl2_dtrsm_l_4x2(
                      double* restrict a11,
                      double* restrict b11,
                      double* restrict bd11,
                      double* restrict c11, inc_t rs_c, inc_t cs_c
                    )
{
	enum { N_ROWS = 4, N_COLS = 2, N_COPIES = 2 };

	// Fixed strides of the packed operands.
	const dim_t rs_a   = 1;
	const dim_t cs_a   = 4;
	const dim_t rs_b   = 2;
	const dim_t cs_b   = 1;
	const dim_t NDUP   = 2;
	const dim_t inc_bd = cs_b*NDUP;

	double x[ N_ROWS ][ N_COLS ];  // working copy of b11: x[row][column]
	double l[ N_ROWS ];            // current row of a11, up to the diagonal
	int    i, j, p;

	// Read the whole block of b11 before anything is written back.
	for ( i = 0; i < N_ROWS; ++i )
		for ( j = 0; j < N_COLS; ++j )
			x[ i ][ j ] = *( b11 + i*rs_b + j*cs_b );

	// Solve one row per pass.
	for ( i = 0; i < N_ROWS; ++i )
	{
		// Entries of row i of a11; l[i] is the diagonal entry.
		for ( p = 0; p <= i; ++p )
			l[ p ] = *( a11 + i*rs_a + p*cs_a );

		// Subtract the contribution of the rows already solved. The sums
		// are spelled out per row so the floating-point evaluation order
		// stays fixed.
		for ( j = 0; j < N_COLS; ++j )
		{
			switch ( i )
			{
				case 0:
					x[ 0 ][ j ] -= 0.0;
					break;

				case 1:
					x[ 1 ][ j ] -= l[ 0 ] * x[ 0 ][ j ];
					break;

				case 2:
					x[ 2 ][ j ] -= l[ 0 ] * x[ 0 ][ j ] +
					               l[ 1 ] * x[ 1 ][ j ];
					break;

				default:
					x[ 3 ][ j ] -= l[ 0 ] * x[ 0 ][ j ] +
					               l[ 1 ] * x[ 1 ][ j ] +
					               l[ 2 ] * x[ 2 ][ j ];
					break;
			}
		}

		// Scale by the diagonal entry.
		for ( j = 0; j < N_COLS; ++j )
			x[ i ][ j ] *= l[ i ];

		// Store the finished row, first to b11 and then to c11.
		for ( j = 0; j < N_COLS; ++j )
			*( b11 + i*rs_b + j*cs_b ) = x[ i ][ j ];

		for ( j = 0; j < N_COLS; ++j )
			*( c11 + i*rs_c + j*cs_c ) = x[ i ][ j ];
	}

	// Update bd: every solved value is written twice, back to back.
	for ( i = 0; i < N_ROWS; ++i )
		for ( j = 0; j < N_COLS; ++j )
			for ( p = 0; p < N_COPIES; ++p )
				*( bd11 + ( i*N_COLS + j )*inc_bd + p*cs_b ) = x[ i ][ j ];
}
// Single-precision complex 4x2 lower-triangular solve micro-kernel:
// placeholder. No solve is performed. The arguments are ignored and the only
// action is to pass BLIS_NOT_YET_IMPLEMENTED to bl2_check_error_code(); what
// happens after that (abort or return) is decided by that routine, which is
// not visible here.
void bl2_ctrsm_l_4x2(
scomplex* restrict a11,
scomplex* restrict b11,
scomplex* restrict bd11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
// Double-precision complex 4x2 lower-triangular solve micro-kernel:
// placeholder. No solve is performed. The arguments are ignored and the only
// action is to pass BLIS_NOT_YET_IMPLEMENTED to bl2_check_error_code(); what
// happens after that (abort or return) is decided by that routine, which is
// not visible here.
void bl2_ztrsm_l_4x2(
dcomplex* restrict a11,
dcomplex* restrict b11,
dcomplex* restrict bd11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}

View File

@@ -1,47 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype the typed 4x2 lower-triangular solve micro-kernels.
//
// GENTPROT declares one micro-kernel; the name is formed from the datatype
// character ch and varname via PASTEMAC. Each takes the triangular block
// a11, the block b11, the duplicated buffer bd11, and the output block c11
// with its row and column strides. INSERT_GENTPROT_BASIC presumably
// instantiates the prototype once per basic datatype -- its definition is
// not visible here.
//
// NOTE(review): the definitions in the matching .c file qualify all four
// pointers with restrict while these prototypes do not. The declarations
// remain compatible, but aligning them would make the contract visible to
// callers.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
ctype* a11, \
ctype* b11, \
ctype* bd11, \
ctype* c11, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_l_4x2 )

View File

@@ -1,222 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Single-precision real 4x4 lower-triangular solve micro-kernel: placeholder.
// No solve is performed. The arguments are ignored and the only action is to
// pass BLIS_NOT_YET_IMPLEMENTED to bl2_check_error_code(); what happens after
// that (abort or return) is decided by that routine, which is not visible
// here.
void bl2_strsm_l_4x4(
float* restrict a11,
float* restrict b11,
float* restrict bd11,
float* restrict c11, inc_t rs_c, inc_t cs_c
)
{
bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bl2_dtrsm_l_4x4(
    double* restrict a11,
    double* restrict b11,
    double* restrict bd11,
    double* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Double-precision 4x4 lower-triangular solve by forward substitution.
    //
    //   a11  4x4 triangular block, read with row stride 1 and column
    //        stride 4. Only the lower triangle is read. Each diagonal
    //        entry is applied by multiplication (there is no division),
    //        so it is presumably stored pre-inverted -- confirm with the
    //        packing code.
    //   b11  4x4 right-hand side, row stride 4 and column stride 1;
    //        overwritten with the solution.
    //   bd11 not referenced by this kernel.
    //   c11  receives a copy of the solution using strides rs_c / cs_c.
    //
    // Rows are solved top to bottom; each row is written to b11 and then
    // to c11 before the next row of a11 is read.
    const dim_t a_rs = 1;
    const dim_t a_cs = 4;
    const dim_t b_rs = 4;
    const dim_t b_cs = 1;

    double x[4][4];                      // working copy of b11
    double d0, d1, d2, d3;               // diagonal of a11
    double l10, l20, l21, l30, l31, l32; // strictly lower part of a11
    int    i, j;

    // Pull the whole right-hand side into locals.
    for ( i = 0; i < 4; ++i )
    {
        for ( j = 0; j < 4; ++j )
        {
            x[i][j] = b11[ i*b_rs + j*b_cs ];
        }
    }

    // Row 0: nothing solved yet, so the subtraction is an explicit no-op.
    d0 = a11[ 0*a_rs + 0*a_cs ];
    for ( j = 0; j < 4; ++j )
    {
        x[0][j] -= 0.0;
        x[0][j] *= d0;
        b11[ 0*b_rs + j*b_cs ] = x[0][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        c11[ 0*rs_c + j*cs_c ] = x[0][j];
    }

    // Row 1: eliminate the contribution of row 0.
    l10 = a11[ 1*a_rs + 0*a_cs ];
    d1  = a11[ 1*a_rs + 1*a_cs ];
    for ( j = 0; j < 4; ++j )
    {
        x[1][j] -= l10 * x[0][j];
        x[1][j] *= d1;
        b11[ 1*b_rs + j*b_cs ] = x[1][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        c11[ 1*rs_c + j*cs_c ] = x[1][j];
    }

    // Row 2: eliminate the contributions of rows 0 and 1.
    l20 = a11[ 2*a_rs + 0*a_cs ];
    l21 = a11[ 2*a_rs + 1*a_cs ];
    d2  = a11[ 2*a_rs + 2*a_cs ];
    for ( j = 0; j < 4; ++j )
    {
        x[2][j] -= l20 * x[0][j] +
                   l21 * x[1][j];
        x[2][j] *= d2;
        b11[ 2*b_rs + j*b_cs ] = x[2][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        c11[ 2*rs_c + j*cs_c ] = x[2][j];
    }

    // Row 3: eliminate the contributions of rows 0, 1 and 2.
    l30 = a11[ 3*a_rs + 0*a_cs ];
    l31 = a11[ 3*a_rs + 1*a_cs ];
    l32 = a11[ 3*a_rs + 2*a_cs ];
    d3  = a11[ 3*a_rs + 3*a_cs ];
    for ( j = 0; j < 4; ++j )
    {
        x[3][j] -= l30 * x[0][j] +
                   l31 * x[1][j] +
                   l32 * x[2][j];
        x[3][j] *= d3;
        b11[ 3*b_rs + j*b_cs ] = x[3][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        c11[ 3*rs_c + j*cs_c ] = x[3][j];
    }
}
void bl2_ctrsm_l_4x4(
    scomplex* restrict a11,
    scomplex* restrict b11,
    scomplex* restrict bd11,
    scomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Placeholder for the single-precision complex lower-triangular 4x4
    // kernel: none of the operands are touched.
    (void)a11;
    (void)b11;
    (void)bd11;
    (void)c11;
    (void)rs_c;
    (void)cs_c;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bl2_ztrsm_l_4x4(
    dcomplex* restrict a11,
    dcomplex* restrict b11,
    dcomplex* restrict bd11,
    dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Placeholder for the double-precision complex lower-triangular 4x4
    // kernel: none of the operands are touched.
    (void)a11;
    (void)b11;
    (void)bd11;
    (void)c11;
    (void)rs_c;
    (void)cs_c;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}

View File

@@ -1,47 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype template for the trsm_l_4x4 kernels. Each expansion of
// GENTPROT( ctype, ch, varname ) declares a void function named
// PASTEMAC(ch,varname) that takes three ctype pointers (a11, b11, bd11)
// followed by c11 and its row/column strides rs_c and cs_c.
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// expands GENTPROT once per basic datatype (s, d, c, z), yielding the
// bl2_[sdcz]trsm_l_4x4 declarations -- confirm.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
ctype* a11, \
ctype* b11, \
ctype* bd11, \
ctype* c11, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_l_4x4 )

View File

@@ -1,200 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
void bl2_strsm_u_4x2(
    float* restrict a11,
    float* restrict b11,
    float* restrict bd11,
    float* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Placeholder for the single-precision real upper-triangular 4x2
    // kernel: none of the operands are touched.
    (void)a11;
    (void)b11;
    (void)bd11;
    (void)c11;
    (void)rs_c;
    (void)cs_c;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bl2_dtrsm_u_4x2(
    double* restrict a11,
    double* restrict b11,
    double* restrict bd11,
    double* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Double-precision upper-triangular solve of a 4x4 block against a
    // 4x2 right-hand side, by back substitution.
    //
    //   a11  4x4 triangular block, read with row stride 1 and column
    //        stride 4. Only the upper triangle is read. Each diagonal
    //        entry is applied by multiplication (there is no division),
    //        so it is presumably stored pre-inverted -- confirm with the
    //        packing code.
    //   b11  4x2 right-hand side, row stride 2 and column stride 1;
    //        overwritten with the solution.
    //   bd11 receives every solution element twice in a row, elements
    //        taken in row-major order of b11.
    //   c11  receives a copy of the solution using strides rs_c / cs_c.
    //
    // Rows are solved bottom to top; each row is written to b11 and then
    // to c11 before the next row of a11 is read. bd11 is filled last.
    const dim_t a_rs   = 1;
    const dim_t a_cs   = 4;
    const dim_t b_rs   = 2;
    const dim_t b_cs   = 1;
    const dim_t n_dup  = 2;
    const dim_t bd_inc = b_cs*n_dup;

    double x[4][2];                      // working copy of b11
    double d0, d1, d2, d3;               // diagonal of a11
    double u01, u02, u03, u12, u13, u23; // strictly upper part of a11
    int    i, j;
    dim_t  d;

    // Pull the whole right-hand side into locals.
    for ( i = 0; i < 4; ++i )
    {
        for ( j = 0; j < 2; ++j )
        {
            x[i][j] = b11[ i*b_rs + j*b_cs ];
        }
    }

    // Row 3: nothing solved yet, so the subtraction is an explicit no-op.
    d3 = a11[ 3*a_rs + 3*a_cs ];
    for ( j = 0; j < 2; ++j )
    {
        x[3][j] -= 0.0;
        x[3][j] *= d3;
        b11[ 3*b_rs + j*b_cs ] = x[3][j];
    }
    for ( j = 0; j < 2; ++j )
    {
        c11[ 3*rs_c + j*cs_c ] = x[3][j];
    }

    // Row 2: eliminate the contribution of row 3.
    d2  = a11[ 2*a_rs + 2*a_cs ];
    u23 = a11[ 2*a_rs + 3*a_cs ];
    for ( j = 0; j < 2; ++j )
    {
        x[2][j] -= u23 * x[3][j];
        x[2][j] *= d2;
        b11[ 2*b_rs + j*b_cs ] = x[2][j];
    }
    for ( j = 0; j < 2; ++j )
    {
        c11[ 2*rs_c + j*cs_c ] = x[2][j];
    }

    // Row 1: eliminate the contributions of rows 2 and 3.
    d1  = a11[ 1*a_rs + 1*a_cs ];
    u12 = a11[ 1*a_rs + 2*a_cs ];
    u13 = a11[ 1*a_rs + 3*a_cs ];
    for ( j = 0; j < 2; ++j )
    {
        x[1][j] -= u12 * x[2][j] +
                   u13 * x[3][j];
        x[1][j] *= d1;
        b11[ 1*b_rs + j*b_cs ] = x[1][j];
    }
    for ( j = 0; j < 2; ++j )
    {
        c11[ 1*rs_c + j*cs_c ] = x[1][j];
    }

    // Row 0: eliminate the contributions of rows 1, 2 and 3.
    d0  = a11[ 0*a_rs + 0*a_cs ];
    u01 = a11[ 0*a_rs + 1*a_cs ];
    u02 = a11[ 0*a_rs + 2*a_cs ];
    u03 = a11[ 0*a_rs + 3*a_cs ];
    for ( j = 0; j < 2; ++j )
    {
        x[0][j] -= u01 * x[1][j] +
                   u02 * x[2][j] +
                   u03 * x[3][j];
        x[0][j] *= d0;
        b11[ 0*b_rs + j*b_cs ] = x[0][j];
    }
    for ( j = 0; j < 2; ++j )
    {
        c11[ 0*rs_c + j*cs_c ] = x[0][j];
    }

    // Refresh the duplicated copy: element (i,j) lands in slot
    // i*b_rs + j*b_cs, and each slot holds n_dup consecutive copies.
    for ( i = 0; i < 4; ++i )
    {
        for ( j = 0; j < 2; ++j )
        {
            const dim_t slot = i*b_rs + j*b_cs;

            for ( d = 0; d < n_dup; ++d )
            {
                bd11[ slot*bd_inc + d*b_cs ] = x[i][j];
            }
        }
    }
}
void bl2_ctrsm_u_4x2(
    scomplex* restrict a11,
    scomplex* restrict b11,
    scomplex* restrict bd11,
    scomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Placeholder for the single-precision complex upper-triangular 4x2
    // kernel: none of the operands are touched.
    (void)a11;
    (void)b11;
    (void)bd11;
    (void)c11;
    (void)rs_c;
    (void)cs_c;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bl2_ztrsm_u_4x2(
    dcomplex* restrict a11,
    dcomplex* restrict b11,
    dcomplex* restrict bd11,
    dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Placeholder for the double-precision complex upper-triangular 4x2
    // kernel: none of the operands are touched.
    (void)a11;
    (void)b11;
    (void)bd11;
    (void)c11;
    (void)rs_c;
    (void)cs_c;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}

View File

@@ -1,47 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype template for the trsm_u_4x2 kernels. Each expansion of
// GENTPROT( ctype, ch, varname ) declares a void function named
// PASTEMAC(ch,varname) that takes three ctype pointers (a11, b11, bd11)
// followed by c11 and its row/column strides rs_c and cs_c.
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// expands GENTPROT once per basic datatype (s, d, c, z), yielding the
// bl2_[sdcz]trsm_u_4x2 declarations -- confirm.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
ctype* a11, \
ctype* b11, \
ctype* bd11, \
ctype* c11, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_u_4x2 )

View File

@@ -1,223 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
void bl2_strsm_u_4x4(
    float* restrict a11,
    float* restrict b11,
    float* restrict bd11,
    float* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Placeholder for the single-precision real upper-triangular 4x4
    // kernel: none of the operands are touched.
    (void)a11;
    (void)b11;
    (void)bd11;
    (void)c11;
    (void)rs_c;
    (void)cs_c;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bl2_dtrsm_u_4x4(
    double* restrict a11,
    double* restrict b11,
    double* restrict bd11,
    double* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Double-precision 4x4 upper-triangular solve by back substitution.
    //
    //   a11  4x4 triangular block, read with row stride 1 and column
    //        stride 4. Only the upper triangle is read. Each diagonal
    //        entry is applied by multiplication (there is no division),
    //        so it is presumably stored pre-inverted -- confirm with the
    //        packing code.
    //   b11  4x4 right-hand side, row stride 4 and column stride 1;
    //        overwritten with the solution.
    //   bd11 not referenced by this kernel.
    //   c11  receives a copy of the solution using strides rs_c / cs_c.
    //
    // Rows are solved bottom to top; each row is written to b11 and then
    // to c11 before the next row of a11 is read.
    const dim_t a_rs = 1;
    const dim_t a_cs = 4;
    const dim_t b_rs = 4;
    const dim_t b_cs = 1;

    double x[4][4];                      // working copy of b11
    double d0, d1, d2, d3;               // diagonal of a11
    double u01, u02, u03, u12, u13, u23; // strictly upper part of a11
    int    i, j;

    // Pull the whole right-hand side into locals.
    for ( i = 0; i < 4; ++i )
    {
        for ( j = 0; j < 4; ++j )
        {
            x[i][j] = b11[ i*b_rs + j*b_cs ];
        }
    }

    // Row 3: nothing solved yet, so the subtraction is an explicit no-op.
    d3 = a11[ 3*a_rs + 3*a_cs ];
    for ( j = 0; j < 4; ++j )
    {
        x[3][j] -= 0.0;
        x[3][j] *= d3;
        b11[ 3*b_rs + j*b_cs ] = x[3][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        c11[ 3*rs_c + j*cs_c ] = x[3][j];
    }

    // Row 2: eliminate the contribution of row 3.
    d2  = a11[ 2*a_rs + 2*a_cs ];
    u23 = a11[ 2*a_rs + 3*a_cs ];
    for ( j = 0; j < 4; ++j )
    {
        x[2][j] -= u23 * x[3][j];
        x[2][j] *= d2;
        b11[ 2*b_rs + j*b_cs ] = x[2][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        c11[ 2*rs_c + j*cs_c ] = x[2][j];
    }

    // Row 1: eliminate the contributions of rows 2 and 3.
    d1  = a11[ 1*a_rs + 1*a_cs ];
    u12 = a11[ 1*a_rs + 2*a_cs ];
    u13 = a11[ 1*a_rs + 3*a_cs ];
    for ( j = 0; j < 4; ++j )
    {
        x[1][j] -= u12 * x[2][j] +
                   u13 * x[3][j];
        x[1][j] *= d1;
        b11[ 1*b_rs + j*b_cs ] = x[1][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        c11[ 1*rs_c + j*cs_c ] = x[1][j];
    }

    // Row 0: eliminate the contributions of rows 1, 2 and 3.
    d0  = a11[ 0*a_rs + 0*a_cs ];
    u01 = a11[ 0*a_rs + 1*a_cs ];
    u02 = a11[ 0*a_rs + 2*a_cs ];
    u03 = a11[ 0*a_rs + 3*a_cs ];
    for ( j = 0; j < 4; ++j )
    {
        x[0][j] -= u01 * x[1][j] +
                   u02 * x[2][j] +
                   u03 * x[3][j];
        x[0][j] *= d0;
        b11[ 0*b_rs + j*b_cs ] = x[0][j];
    }
    for ( j = 0; j < 4; ++j )
    {
        c11[ 0*rs_c + j*cs_c ] = x[0][j];
    }
}
void bl2_ctrsm_u_4x4(
    scomplex* restrict a11,
    scomplex* restrict b11,
    scomplex* restrict bd11,
    scomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Placeholder for the single-precision complex upper-triangular 4x4
    // kernel: none of the operands are touched.
    (void)a11;
    (void)b11;
    (void)bd11;
    (void)c11;
    (void)rs_c;
    (void)cs_c;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bl2_ztrsm_u_4x4(
    dcomplex* restrict a11,
    dcomplex* restrict b11,
    dcomplex* restrict bd11,
    dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
    // Placeholder for the double-precision complex upper-triangular 4x4
    // kernel: none of the operands are touched.
    (void)a11;
    (void)b11;
    (void)bd11;
    (void)c11;
    (void)rs_c;
    (void)cs_c;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}

View File

@@ -1,47 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype template for the trsm_u_4x4 kernels. Each expansion of
// GENTPROT( ctype, ch, varname ) declares a void function named
// PASTEMAC(ch,varname) that takes three ctype pointers (a11, b11, bd11)
// followed by c11 and its row/column strides rs_c and cs_c.
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// expands GENTPROT once per basic datatype (s, d, c, z), yielding the
// bl2_[sdcz]trsm_u_4x4 declarations -- confirm.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
ctype* a11, \
ctype* b11, \
ctype* bd11, \
ctype* c11, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_u_4x4 )

View File

@@ -1,33 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

View File

@@ -1,127 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis2.h"
// Per-datatype duplication factors (s, d, c, z), aliased from the
// BLIS_DEFAULT_NUM_DUPL_* constants that are defined outside this file.
#define NDUP_S BLIS_DEFAULT_NUM_DUPL_S
#define NDUP_D BLIS_DEFAULT_NUM_DUPL_D
#define NDUP_C BLIS_DEFAULT_NUM_DUPL_C
#define NDUP_Z BLIS_DEFAULT_NUM_DUPL_Z
// Per-datatype unroll factors (s, d, c, z). Only the double-precision
// value is greater than 1.
#define UNROLL_FAC_S 1
#define UNROLL_FAC_D 8
#define UNROLL_FAC_C 1
#define UNROLL_FAC_Z 1
void bl2_sdupl(
    dim_t  n_elem,
    float* b,
    float* bd
)
{
    // Placeholder for the single-precision real duplication routine:
    // neither buffer is touched.
    (void)n_elem;
    (void)b;
    (void)bd;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bl2_ddupl(
    dim_t   n_elem,
    double* b,
    double* bd
)
{
    // Expands b into bd by writing each of the n_elem source elements
    // NDUP_D times in a row:
    //
    //   bd[ i*NDUP_D + d ] = b[ i ]   for 0 <= i < n_elem, 0 <= d < NDUP_D
    //
    //   n_elem  number of elements to read from b.
    //   b       source buffer, at least n_elem elements.
    //   bd      destination buffer, at least n_elem*NDUP_D elements.
    //
    // The previous version advanced bd by the undefined name NDUP in its
    // cleanup loop, which did not compile. It also hard-coded two copies
    // per element while stepping by NDUP_D, so it was only self-consistent
    // when NDUP_D was 2. Driving both the copy count and the step from
    // NDUP_D gives the same output for NDUP_D == 2 and a consistent layout
    // for any other factor.
    dim_t i;
    dim_t d;

    for ( i = 0; i < n_elem; ++i )
    {
        // Read the source element once, then replicate it.
        const double beta = *b;

        for ( d = 0; d < NDUP_D; ++d )
        {
            *(bd + d) = beta;
        }

        b  += 1;
        bd += NDUP_D;
    }
}
void bl2_cdupl(
    dim_t     n_elem,
    scomplex* b,
    scomplex* bd
)
{
    // Placeholder for the single-precision complex duplication routine:
    // neither buffer is touched.
    (void)n_elem;
    (void)b;
    (void)bd;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bl2_zdupl(
    dim_t     n_elem,
    dcomplex* b,
    dcomplex* bd
)
{
    // Placeholder for the double-precision complex duplication routine:
    // neither buffer is touched.
    (void)n_elem;
    (void)b;
    (void)bd;

    // Hand the "not yet implemented" code to the framework's error checker.
    bl2_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}

View File

@@ -1,46 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype template for the dupl routines. Each expansion of
// GENTPROT( ctype, ch, varname ) declares a void function named
// PASTEMAC(ch,varname) that takes an element count n_elem and two ctype
// pointers, b and bd.
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// expands GENTPROT once per basic datatype (s, d, c, z), yielding the
// bl2_[sdcz]dupl declarations -- confirm.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t n_elem, \
ctype* b, \
ctype* bd \
);
INSERT_GENTPROT_BASIC( dupl )