mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Merge branch 'master' into amd
This commit is contained in:
318
ref_kernels/1m/bli_packm_cxk_bb_ref.c
Normal file
318
ref_kernels/1m/bli_packm_cxk_bb_ref.c
Normal file
@@ -0,0 +1,318 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
void* restrict kappa, \
|
||||
void* restrict a, inc_t inca, inc_t lda, \
|
||||
void* restrict p, inc_t ldp, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict alpha1 = a; \
|
||||
ctype* restrict pi1 = p; \
|
||||
\
|
||||
/* Handle the packing of B (column panel schemas) separately from packing
|
||||
of A (row panel schemas). */ \
|
||||
if ( bli_is_col_packed( schema ) ) \
|
||||
{ \
|
||||
if ( cdim == mnr ) \
|
||||
{ \
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( cdim < mnr ) */ \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2bbs_mxn) \
|
||||
( \
|
||||
conja, \
|
||||
cdim, \
|
||||
n, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, 2, ldp \
|
||||
); \
|
||||
\
|
||||
/* if ( cdim < mnr ) */ \
|
||||
{ \
|
||||
const dim_t i = cdim; \
|
||||
const dim_t m_edge = mnr - cdim; \
|
||||
const dim_t n_edge = n_max; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict p_edge = p_cast + (i )*2; \
|
||||
\
|
||||
PASTEMAC(ch,set0bbs_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 2, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if ( n < n_max ) \
|
||||
{ \
|
||||
const dim_t j = n; \
|
||||
const dim_t m_edge = mnr; \
|
||||
const dim_t n_edge = n_max - n; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict p_edge = p_cast + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC(ch,set0bbs_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 2, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_row_packed( schema ) ) */ \
|
||||
{ \
|
||||
if ( cdim == mnr ) \
|
||||
{ \
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_noconj( conja ) ) */ \
|
||||
{ \
|
||||
for ( dim_t k = n; k != 0; --k ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( cdim < mnr ) */ \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2s_mxn) \
|
||||
( \
|
||||
conja, \
|
||||
cdim, \
|
||||
n, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, 1, ldp \
|
||||
); \
|
||||
\
|
||||
/* if ( cdim < mnr ) */ \
|
||||
{ \
|
||||
const dim_t i = cdim; \
|
||||
const dim_t m_edge = mnr - cdim; \
|
||||
const dim_t n_edge = n_max; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict p_edge = p_cast + (i )*1; \
|
||||
\
|
||||
PASTEMAC(ch,set0s_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 1, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if ( n < n_max ) \
|
||||
{ \
|
||||
const dim_t j = n; \
|
||||
const dim_t m_edge = mnr; \
|
||||
const dim_t n_edge = n_max - n; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict p_edge = p_cast + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC(ch,set0s_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 1, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC3( packm_6xk_bb2, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -188,6 +189,7 @@ INSERT_GENTFUNC_BASIC3( packm_2xk, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -344,6 +346,7 @@ INSERT_GENTFUNC_BASIC3( packm_3xk, 3, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -498,6 +501,7 @@ INSERT_GENTFUNC_BASIC3( packm_4xk, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -641,6 +645,7 @@ INSERT_GENTFUNC_BASIC3( packm_6xk, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -819,6 +824,7 @@ INSERT_GENTFUNC_BASIC3( packm_8xk, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -978,6 +984,7 @@ INSERT_GENTFUNC_BASIC3( packm_10xk, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -1145,6 +1152,7 @@ INSERT_GENTFUNC_BASIC3( packm_12xk, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -1320,6 +1328,7 @@ INSERT_GENTFUNC_BASIC3( packm_14xk, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
@@ -1503,6 +1512,7 @@ INSERT_GENTFUNC_BASIC3( packm_16xk, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t cdim, \
|
||||
dim_t n, \
|
||||
dim_t n_max, \
|
||||
|
||||
142
ref_kernels/3/bb/bli_gemmbb_ref.c
Normal file
142
ref_kernels/3/bb/bli_gemmbb_ref.c
Normal file
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// An implementation that indexes through B with the assumption that all
|
||||
// elements were broadcast (duplicated) by a factor of NP/NR.
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
|
||||
const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
const inc_t cs_a = packmr; \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
\
|
||||
/* Assume that the degree of duplication is equal to packnr / nr. */ \
|
||||
const inc_t cs_b = packnr / nr; \
|
||||
\
|
||||
ctype ab[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ab = 1; \
|
||||
const inc_t cs_ab = mr; \
|
||||
\
|
||||
dim_t l, j, i; \
|
||||
\
|
||||
ctype ai; \
|
||||
ctype bj; \
|
||||
\
|
||||
\
|
||||
/* Initialize the accumulator elements in ab to zero. */ \
|
||||
for ( i = 0; i < m * n; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,set0s)( *(ab + i) ); \
|
||||
} \
|
||||
\
|
||||
/* Perform a series of k rank-1 updates into ab. */ \
|
||||
for ( l = 0; l < k; ++l ) \
|
||||
{ \
|
||||
ctype* restrict abij = ab; \
|
||||
\
|
||||
/* In an optimized implementation, these two loops over MR and NR
|
||||
are typically fully unrolled. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
bj = *(b + j*cs_b); \
|
||||
\
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ai = *(a + i); \
|
||||
\
|
||||
PASTEMAC(ch,dots)( ai, bj, *abij ); \
|
||||
\
|
||||
abij += rs_ab; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a += cs_a; \
|
||||
b += rs_b; \
|
||||
} \
|
||||
\
|
||||
/* Scale the result in ab by alpha. */ \
|
||||
for ( i = 0; i < m * n; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
|
||||
} \
|
||||
\
|
||||
/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
|
||||
scale by beta and then add the scaled redult in ab. */ \
|
||||
if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copys_mxn)( m, \
|
||||
n, \
|
||||
ab, rs_ab, cs_ab, \
|
||||
c, rs_c, cs_c ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m, \
|
||||
n, \
|
||||
ab, rs_ab, cs_ab, \
|
||||
beta, \
|
||||
c, rs_c, cs_c ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( gemmbb, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
138
ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
Normal file
138
ref_kernels/3/bb/bli_gemmtrsmbb_ref.c
Normal file
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// An implementation that indexes through B with the assumption that all
|
||||
// elements were broadcast (duplicated) by a factor of NP/NR.
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a1x, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bx1, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
\
|
||||
/* Assume that the degree of duplication is equal to packnr / nr. */ \
|
||||
const inc_t cs_b = packnr / nr; \
|
||||
/*
|
||||
printf( "bli_gemmtrsmbb_ref(): cs_b = %d\n", (int)cs_b ); \
|
||||
printf( "bli_gemmtrsmbb_ref(): k nr = %d %d\n", (int)k, (int)nr ); \
|
||||
*/ \
|
||||
\
|
||||
ctype* minus_one = PASTEMAC(ch,m1); \
|
||||
\
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
PASTECH(ch,trsm_ukr_ft) \
|
||||
trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \
|
||||
(double*)bx1, rs_b, cs_b, "%5.2f", "" ); \
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \
|
||||
(double*)b11, rs_b, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* lower: b11 = alpha * b11 - a10 * b01; */ \
|
||||
/* upper: b11 = alpha * b11 - a12 * b21; */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
a1x, \
|
||||
bx1, \
|
||||
alpha, \
|
||||
b11, rs_b, cs_b, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
/*
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \
|
||||
(double*)b11, rs_b, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* b11 = inv(a11) * b11;
|
||||
c11 = b11; */ \
|
||||
trsm_ukr \
|
||||
( \
|
||||
a11, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
/*
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \
|
||||
(double*)b11, rs_b, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* Broadcast the elements of the updated b11 submatrix to their
|
||||
duplicated neighbors. */ \
|
||||
PASTEMAC(ch,bcastbbs_mxn) \
|
||||
( \
|
||||
mr, \
|
||||
nr, \
|
||||
b11, rs_b, cs_b \
|
||||
); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \
|
||||
( double* )b01, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \
|
||||
PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_i after", k+3, 8, \
|
||||
( double* )b01 + 1, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC3( gemmtrsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
|
||||
INSERT_GENTFUNC_BASIC3( gemmtrsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
|
||||
|
||||
206
ref_kernels/3/bb/bli_trsmbb_ref.c
Normal file
206
ref_kernels/3/bb/bli_trsmbb_ref.c
Normal file
@@ -0,0 +1,206 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// An implementation that indexes through B with the assumption that all
|
||||
// elements were broadcast (duplicated) by a factor of NP/NR.
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
|
||||
const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
const inc_t rs_a = 1; \
|
||||
const inc_t cs_a = packmr; \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
\
|
||||
/* Assume that the degree of duplication is equal to packnr / nr. */ \
|
||||
const inc_t cs_b = packnr / nr; \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
\
|
||||
for ( iter = 0; iter < m; ++iter ) \
|
||||
{ \
|
||||
i = iter; \
|
||||
n_behind = i; \
|
||||
\
|
||||
ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \
|
||||
ctype* restrict a10t = a + (i )*rs_a + (0 )*cs_a; \
|
||||
ctype* restrict B0 = b + (0 )*rs_b + (0 )*cs_b; \
|
||||
ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \
|
||||
\
|
||||
/* b1 = b1 - a10t * B0; */ \
|
||||
/* b1 = b1 / alpha11; */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict b01 = B0 + (0 )*rs_b + (j )*cs_b; \
|
||||
ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \
|
||||
ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \
|
||||
ctype beta11c = *beta11; \
|
||||
ctype rho11; \
|
||||
\
|
||||
/* beta11 = beta11 - a10t * b01; */ \
|
||||
PASTEMAC(ch,set0s)( rho11 ); \
|
||||
for ( l = 0; l < n_behind; ++l ) \
|
||||
{ \
|
||||
ctype* restrict alpha10 = a10t + (l )*cs_a; \
|
||||
ctype* restrict beta01 = b01 + (l )*rs_b; \
|
||||
\
|
||||
PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \
|
||||
} \
|
||||
PASTEMAC(ch,subs)( rho11, beta11c ); \
|
||||
\
|
||||
/* beta11 = beta11 / alpha11; */ \
|
||||
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
|
||||
of alpha11, so we can multiply rather than divide. We store
|
||||
the inverse of alpha11 intentionally to avoid expensive
|
||||
division instructions within the micro-kernel. */ \
|
||||
PASTEMAC(ch,scals)( *alpha11, beta11c ); \
|
||||
\
|
||||
/* Output final result to matrix c. */ \
|
||||
PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
|
||||
\
|
||||
/* Store the local value back to b11. */ \
|
||||
PASTEMAC(ch,copys)( beta11c, *beta11 ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
|
||||
const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
const inc_t rs_a = 1; \
|
||||
const inc_t cs_a = packmr; \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
\
|
||||
/* Assume that the degree of duplication is equal to packnr / nr. */ \
|
||||
const inc_t cs_b = packnr / nr; \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
\
|
||||
for ( iter = 0; iter < m; ++iter ) \
|
||||
{ \
|
||||
i = m - iter - 1; \
|
||||
n_behind = iter; \
|
||||
\
|
||||
ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \
|
||||
ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \
|
||||
ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \
|
||||
ctype* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; \
|
||||
\
|
||||
/* b1 = b1 - a12t * B2; */ \
|
||||
/* b1 = b1 / alpha11; */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \
|
||||
ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \
|
||||
ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \
|
||||
ctype beta11c = *beta11; \
|
||||
ctype rho11; \
|
||||
\
|
||||
/* beta11 = beta11 - a12t * b21; */ \
|
||||
PASTEMAC(ch,set0s)( rho11 ); \
|
||||
for ( l = 0; l < n_behind; ++l ) \
|
||||
{ \
|
||||
ctype* restrict alpha12 = a12t + (l )*cs_a; \
|
||||
ctype* restrict beta21 = b21 + (l )*rs_b; \
|
||||
\
|
||||
PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \
|
||||
} \
|
||||
PASTEMAC(ch,subs)( rho11, beta11c ); \
|
||||
\
|
||||
/* beta11 = beta11 / alpha11; */ \
|
||||
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
|
||||
of alpha11, so we can multiply rather than divide. We store
|
||||
the inverse of alpha11 intentionally to avoid expensive
|
||||
division instructions within the micro-kernel. */ \
|
||||
PASTEMAC(ch,scals)( *alpha11, beta11c ); \
|
||||
\
|
||||
/* Output final result to matrix c. */ \
|
||||
PASTEMAC(ch,copys)( beta11c, *gamma11 ); \
|
||||
\
|
||||
/* Store the local value back to b11. */ \
|
||||
PASTEMAC(ch,copys)( beta11c, *beta11 ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
Reference in New Issue
Block a user