mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Added a_next, b_next arguments to micro-kernels.
Details: - Added two more arguments to the gemm and gemmtrsm microkernels: the addresses of the next micro-panels of A and B. By passing these pointers into the micro-kernel, we allow the micro-kernel author to prefetch micro-panels of A and B as necessary (though this is completely optional; these addresses may also be safely ignored). - Updated all seven macro-kernels so that they compute and pass in a_next and b_next. Note that ONLY the gemm macro-kernel computes a_next and b_next with the precise semantics we want. I will go back and fix the other macro-kernels in the near future. - Added 'restrict' to various micro-kernels from which it was missing.
This commit is contained in:
@@ -261,10 +261,13 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
//#define GEMM_UKERNEL gemm_ref_mxn
|
||||
#define GEMM_UKERNEL gemm_opt_d4x2
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x2
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x2
|
||||
|
||||
|
||||
@@ -173,6 +173,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
dim_t k_nr; \
|
||||
dim_t m_iter, m_left; \
|
||||
@@ -240,13 +242,22 @@ void PASTEMAC(ch,varname)( \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( bli_is_last_iter_f( i, m_iter, m_left ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
/*if ( i == n_iter - 1 && n_left == 0 )*/ \
|
||||
if ( bli_is_last_iter_f( i, n_iter, n_left ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
@@ -254,7 +265,8 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
|
||||
a1, \
|
||||
bp, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
@@ -263,13 +275,21 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
|
||||
/* Bottom edge handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( bli_is_last_iter_f( i, n_iter, n_left ) ) \
|
||||
b2 = b_cast; \
|
||||
\
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_left, NR, \
|
||||
@@ -291,17 +311,29 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
|
||||
of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Right edge loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( bli_is_last_iter_f( i, m_iter, m_left ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale the right edge of C and add the result from above. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn)( MR, n_left, \
|
||||
@@ -316,13 +348,18 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
|
||||
/* Bottom-right corner handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a_cast; \
|
||||
b2 = b_cast; \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale the bottom-right corner of C and add the result from above. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_left, n_left, \
|
||||
@@ -331,6 +368,10 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm_ker_var2, GEMM_UKERNEL )
|
||||
|
||||
@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
) \
|
||||
{ \
|
||||
const dim_t m = PASTEMAC(ch,mr); \
|
||||
|
||||
@@ -45,7 +45,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ref_mxn )
|
||||
|
||||
@@ -177,6 +177,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t k_nr; \
|
||||
@@ -244,12 +246,20 @@ void PASTEMAC(ch,varname)( \
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Compute the address of the next panel of B. */ \
|
||||
b2 = b1 + cstep_b; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 && m_left == 0 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it in the temporary buffer and then add in the elements
|
||||
@@ -266,7 +276,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn_l)( diagoffc_ij, \
|
||||
@@ -283,7 +294,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
@@ -294,13 +306,17 @@ void PASTEMAC(ch,varname)( \
|
||||
to factor in here.) */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_left, NR, \
|
||||
@@ -322,12 +338,20 @@ void PASTEMAC(ch,varname)( \
|
||||
of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Compute the address of the next panel of B. */ \
|
||||
b2 = b_cast; \
|
||||
\
|
||||
/* Right edge loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 && m_left == 0 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
if ( bli_intersects_diag_n( diagoffc_ij, MR, n_left ) ) \
|
||||
{ \
|
||||
@@ -337,7 +361,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn_l)( diagoffc_ij, \
|
||||
@@ -357,13 +382,17 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Bottom-right corner handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn_l)( diagoffc_ij, \
|
||||
|
||||
@@ -177,6 +177,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t k_nr; \
|
||||
@@ -244,12 +246,20 @@ void PASTEMAC(ch,varname)( \
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Compute the address of the next panel of B. */ \
|
||||
b2 = b1 + cstep_b; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 && m_left == 0 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it in the temporary buffer and then add in the elements
|
||||
@@ -266,7 +276,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn_u)( diagoffc_ij, \
|
||||
@@ -283,7 +294,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
@@ -295,6 +307,9 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* The following conditional only executes when the bottom edge
|
||||
case for this particular column panel happens to intersect the
|
||||
@@ -307,7 +322,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn_u)( diagoffc_ij, \
|
||||
@@ -331,18 +347,27 @@ void PASTEMAC(ch,varname)( \
|
||||
of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Compute the address of the next panel of B. */ \
|
||||
b2 = b1 + cstep_b; \
|
||||
\
|
||||
/* Right edge loop over the m dimension (MR rows at a time). */ \
|
||||
/* (Note that the diagonal is guaranteed not to factor in here.) */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 && m_left == 0 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale the right edge of C and add the result. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn)( MR, n_left, \
|
||||
@@ -360,13 +385,17 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Bottom-right corner handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn_u)( diagoffc_ij, \
|
||||
|
||||
@@ -181,6 +181,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict bp_i; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
@@ -281,6 +283,9 @@ void PASTEMAC(ch,varname)( \
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Compute the address of the next panel of B. */ \
|
||||
b2 = b1 + cstep_b; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
@@ -302,6 +307,11 @@ void PASTEMAC(ch,varname)( \
|
||||
k_a1011 = bli_min( k, diagoffa_i + MR ); \
|
||||
\
|
||||
bp_i = bp + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + k_a1011 * PACKMR; \
|
||||
if ( i == m_iter - 1 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
@@ -312,7 +322,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -327,7 +338,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
|
||||
@@ -339,6 +351,11 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
@@ -348,7 +365,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
one, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -358,7 +376,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
|
||||
|
||||
@@ -181,6 +181,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict bp_i; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
@@ -281,6 +283,9 @@ void PASTEMAC(ch,varname)( \
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Compute the address of the next panel of B. */ \
|
||||
b2 = b1 + cstep_b; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
@@ -305,6 +310,11 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_u_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_u_ker_var2: b1", k_a1112, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
|
||||
\
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + k_a1112 * PACKMR; \
|
||||
if ( i == m_iter - 1 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
@@ -315,7 +325,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -330,7 +341,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
|
||||
@@ -342,6 +354,11 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
@@ -351,7 +368,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
one, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -361,7 +379,8 @@ void PASTEMAC(ch,varname)( \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \
|
||||
|
||||
@@ -175,6 +175,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bp01; \
|
||||
ctype* restrict bp11; \
|
||||
ctype* restrict bp_i; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
@@ -284,6 +286,9 @@ void PASTEMAC(ch,varname)( \
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Compute the address of the next panel of B. */ \
|
||||
b2 = b1 + cstep_b; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
@@ -324,6 +329,11 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: a11 (diag)", MR, MR, a11, 1, MR,
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + k_a1011 * PACKMR; \
|
||||
if ( i == m_iter - 1 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
@@ -336,7 +346,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR,
|
||||
bp01, \
|
||||
bp11, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -348,7 +359,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR,
|
||||
bp01, \
|
||||
bp11, \
|
||||
b11, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
|
||||
@@ -364,6 +376,10 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR,
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
@@ -374,7 +390,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "
|
||||
a1, \
|
||||
bp, \
|
||||
alpha_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -384,7 +401,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
|
||||
@@ -175,6 +175,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bp21; \
|
||||
ctype* restrict bp11; \
|
||||
ctype* restrict bp_i; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t m_iter, m_left; \
|
||||
@@ -284,6 +286,9 @@ void PASTEMAC(ch,varname)( \
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Compute the address of the next panel of B. */ \
|
||||
b2 = b1 + cstep_b; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( ib = 0; ib < m_iter; ++ib ) \
|
||||
@@ -338,7 +343,11 @@ printf( "k_a11 = %lu\n", k_a11 ); \
|
||||
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
|
||||
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + k_a1112 * PACKMR; \
|
||||
if ( i == m_iter - 1 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
@@ -351,7 +360,8 @@ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
|
||||
bp21, \
|
||||
bp11, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -363,7 +373,8 @@ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
|
||||
bp21, \
|
||||
bp11, \
|
||||
b11, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: bp11 after (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
|
||||
@@ -380,6 +391,11 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur,
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 ) \
|
||||
a2 = a_cast; \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
@@ -389,7 +405,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur,
|
||||
a1, \
|
||||
bp, \
|
||||
alpha_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -399,7 +416,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur,
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
|
||||
@@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bdT, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
) \
|
||||
{ \
|
||||
const inc_t rs_b = PASTEMAC(ch,packnr); \
|
||||
@@ -60,7 +62,9 @@ void PASTEMAC(ch,varname)( \
|
||||
aL, \
|
||||
bdT, \
|
||||
alpha, \
|
||||
b, rs_b, cs_b ); \
|
||||
b, rs_b, cs_b, \
|
||||
a_next, \
|
||||
b_next ); \
|
||||
\
|
||||
/* b = inv(a) * b;
|
||||
bd = b; (if gemm ukernel needs duplicated B)
|
||||
|
||||
@@ -47,7 +47,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bdT, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_l_ref_mxn )
|
||||
|
||||
@@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bdB, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
) \
|
||||
{ \
|
||||
const inc_t rs_b = PASTEMAC(ch,packnr); \
|
||||
@@ -60,7 +62,9 @@ void PASTEMAC(ch,varname)( \
|
||||
aR, \
|
||||
bdB, \
|
||||
alpha, \
|
||||
b, rs_b, cs_b ); \
|
||||
b, rs_b, cs_b, \
|
||||
a_next, \
|
||||
b_next ); \
|
||||
\
|
||||
/* b = inv(a) * b;
|
||||
bd = b; (if gemm ukernel needs duplicated B)
|
||||
|
||||
@@ -47,7 +47,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bdB, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_u_ref_mxn )
|
||||
|
||||
@@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const dim_t m = PASTEMAC(ch,mr); \
|
||||
|
||||
@@ -43,7 +43,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_ref_mxn )
|
||||
|
||||
@@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const dim_t m = PASTEMAC(ch,mr); \
|
||||
|
||||
@@ -43,7 +43,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_u_ref_mxn )
|
||||
|
||||
@@ -424,6 +424,10 @@
|
||||
\
|
||||
( i1 != iter - 1 || left == 0 )
|
||||
|
||||
/* Evaluates to nonzero when i1 equals iter - 1 and left is zero, i.e. when
   i1 is the final index of a loop that counts up from 0 to iter - 1 and no
   leftover (edge) iteration follows it.

   Each argument is parenthesized so that an expression argument (a shift,
   a bitwise operation, a conditional expression, ...) is evaluated as a
   unit instead of being re-associated by operator precedence. */
#define bli_is_last_iter_f( i1, iter, left ) \
\
	( (i1) == (iter) - 1 && (left) == 0 )
|
||||
|
||||
/* Evaluates to nonzero when i1 is zero and left is nonzero. The iter
   argument is accepted but not used by the expression.
   NOTE(review): the _b suffix presumably denotes loops that move backward
   toward index 0 -- confirm against the callers.

   Each used argument is parenthesized so that an expression argument is
   evaluated as a unit instead of being re-associated by operator
   precedence. */
#define bli_is_edge_b( i1, iter, left ) \
\
	( (i1) == 0 && (left) != 0 )
|
||||
@@ -432,6 +436,10 @@
|
||||
\
|
||||
( i1 != 0 || left == 0 )
|
||||
|
||||
/* Evaluates to nonzero when i1 is zero and left is zero. The iter argument
   is accepted but not used by the expression.
   NOTE(review): the _b suffix presumably denotes loops that move backward
   toward index 0, making i1 == 0 the final iteration -- confirm against
   the callers.

   Each used argument is parenthesized so that an expression argument is
   evaluated as a unit instead of being re-associated by operator
   precedence. */
#define bli_is_last_iter_b( i1, iter, left ) \
\
	( (i1) == 0 && (left) == 0 )
|
||||
|
||||
|
||||
// packbuf_t-related
|
||||
|
||||
|
||||
@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
) \
|
||||
{ \
|
||||
ctype a0; \
|
||||
|
||||
@@ -42,7 +42,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ref_4x2 )
|
||||
|
||||
@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
) \
|
||||
{ \
|
||||
ctype a0; \
|
||||
|
||||
@@ -42,7 +42,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ref_4x4 )
|
||||
|
||||
@@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bdT, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
) \
|
||||
{ \
|
||||
ctype* minus_one = PASTEMAC(ch,m1); \
|
||||
@@ -59,7 +61,9 @@ void PASTEMAC(ch,varname)( \
|
||||
aL, \
|
||||
bdT, \
|
||||
alpha, \
|
||||
b, rs_b, cs_b ); \
|
||||
b, rs_b, cs_b, \
|
||||
a_next, \
|
||||
b_next ); \
|
||||
\
|
||||
PASTEMAC(ch,trsmukr)( a, \
|
||||
b, \
|
||||
|
||||
@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bdT, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_l_ref_4x4 )
|
||||
|
||||
@@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bdB, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
) \
|
||||
{ \
|
||||
ctype* minus_one = PASTEMAC(ch,m1); \
|
||||
@@ -59,7 +61,8 @@ void PASTEMAC(ch,varname)( \
|
||||
aR, \
|
||||
bdB, \
|
||||
alpha, \
|
||||
b, rs_b, cs_b ); \
|
||||
b, rs_b, cs_b, \
|
||||
a_next, b_next ); \
|
||||
\
|
||||
PASTEMAC(ch,trsmukr)( a, \
|
||||
b, \
|
||||
|
||||
@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict bdB, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_u_ref_4x4 )
|
||||
|
||||
@@ -35,24 +35,24 @@
|
||||
#include "blis.h"
|
||||
|
||||
void bli_sgemm_opt_d2x4(
|
||||
dim_t k,
|
||||
float* alpha,
|
||||
float* a,
|
||||
float* b,
|
||||
float* beta,
|
||||
float* c, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_dgemm_opt_d2x4(
|
||||
dim_t k,
|
||||
double* alpha,
|
||||
double* a,
|
||||
double* b,
|
||||
double* beta,
|
||||
double* c, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
dim_t k_iter;
|
||||
@@ -366,24 +366,24 @@ void bli_dgemm_opt_d2x4(
|
||||
}
|
||||
|
||||
void bli_cgemm_opt_d2x4(
|
||||
dim_t k,
|
||||
scomplex* alpha,
|
||||
scomplex* a,
|
||||
scomplex* b,
|
||||
scomplex* beta,
|
||||
scomplex* c, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_zgemm_opt_d2x4(
|
||||
dim_t k,
|
||||
dcomplex* alpha,
|
||||
dcomplex* a,
|
||||
dcomplex* b,
|
||||
dcomplex* beta,
|
||||
dcomplex* c, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
@@ -37,12 +37,12 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, \
|
||||
ctype* b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_opt_d2x4 )
|
||||
|
||||
@@ -40,7 +40,9 @@ void bli_sgemm_opt_d4x2(
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
@@ -52,7 +54,9 @@ void bli_dgemm_opt_d4x2(
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
double* restrict b_next
|
||||
)
|
||||
{
|
||||
dim_t k_iter;
|
||||
@@ -325,7 +329,9 @@ void bli_cgemm_opt_d4x2(
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
scomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
@@ -337,7 +343,9 @@ void bli_zgemm_opt_d4x2(
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
dcomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
@@ -42,7 +42,9 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_opt_d4x2 )
|
||||
|
||||
@@ -35,26 +35,32 @@
|
||||
#include "blis.h"
|
||||
|
||||
void bli_sgemmtrsm_l_opt_d4x2(
|
||||
dim_t k,
|
||||
float* a10,
|
||||
float* a11,
|
||||
float* bd01,
|
||||
float* bd11,
|
||||
float* b11,
|
||||
float* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a10,
|
||||
float* restrict a11,
|
||||
float* restrict bd01,
|
||||
float* restrict bd11,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_dgemmtrsm_l_opt_d4x2(
|
||||
dim_t k,
|
||||
double* a10,
|
||||
double* a11,
|
||||
double* bd01,
|
||||
double* bd11,
|
||||
double* b11,
|
||||
double* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a10,
|
||||
double* restrict a11,
|
||||
double* restrict bd01,
|
||||
double* restrict bd11,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
double* restrict b_next
|
||||
)
|
||||
{
|
||||
dim_t k_iter;
|
||||
@@ -405,26 +411,32 @@ void bli_dgemmtrsm_l_opt_d4x2(
|
||||
}
|
||||
|
||||
void bli_cgemmtrsm_l_opt_d4x2(
|
||||
dim_t k,
|
||||
scomplex* a10,
|
||||
scomplex* a11,
|
||||
scomplex* bd01,
|
||||
scomplex* bd11,
|
||||
scomplex* b11,
|
||||
scomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a10,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd01,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
scomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_zgemmtrsm_l_opt_d4x2(
|
||||
dim_t k,
|
||||
dcomplex* a10,
|
||||
dcomplex* a11,
|
||||
dcomplex* bd01,
|
||||
dcomplex* bd11,
|
||||
dcomplex* b11,
|
||||
dcomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a10,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd01,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
dcomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
@@ -37,13 +37,16 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* a10, \
|
||||
ctype* a11, \
|
||||
ctype* bd01, \
|
||||
ctype* bd11, \
|
||||
ctype* b11, \
|
||||
ctype* c11, inc_t rs_c, inc_t cs_c \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd01, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_d4x2 )
|
||||
|
||||
@@ -35,26 +35,32 @@
|
||||
#include "blis.h"
|
||||
|
||||
void bli_sgemmtrsm_u_opt_d4x2(
|
||||
dim_t k,
|
||||
float* a12,
|
||||
float* a11,
|
||||
float* bd21,
|
||||
float* bd11,
|
||||
float* b11,
|
||||
float* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a12,
|
||||
float* restrict a11,
|
||||
float* restrict bd21,
|
||||
float* restrict bd11,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_dgemmtrsm_u_opt_d4x2(
|
||||
dim_t k,
|
||||
double* a12,
|
||||
double* a11,
|
||||
double* bd21,
|
||||
double* bd11,
|
||||
double* b11,
|
||||
double* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a12,
|
||||
double* restrict a11,
|
||||
double* restrict bd21,
|
||||
double* restrict bd11,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
double* restrict b_next
|
||||
)
|
||||
{
|
||||
dim_t k_iter;
|
||||
@@ -408,26 +414,32 @@ void bli_dgemmtrsm_u_opt_d4x2(
|
||||
}
|
||||
|
||||
void bli_cgemmtrsm_u_opt_d4x2(
|
||||
dim_t k,
|
||||
scomplex* a12,
|
||||
scomplex* a11,
|
||||
scomplex* bd21,
|
||||
scomplex* bd11,
|
||||
scomplex* b11,
|
||||
scomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a12,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd21,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
scomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_zgemmtrsm_u_opt_d4x2(
|
||||
dim_t k,
|
||||
dcomplex* a12,
|
||||
dcomplex* a11,
|
||||
dcomplex* bd21,
|
||||
dcomplex* bd11,
|
||||
dcomplex* b11,
|
||||
dcomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a12,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd21,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
dcomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
@@ -37,13 +37,16 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* a12, \
|
||||
ctype* a11, \
|
||||
ctype* bd21, \
|
||||
ctype* bd11, \
|
||||
ctype* b11, \
|
||||
ctype* c11, inc_t rs_c, inc_t cs_c \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a12, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd21, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_d4x2 )
|
||||
|
||||
@@ -35,20 +35,20 @@
|
||||
#include "blis.h"
|
||||
|
||||
void bli_strsm_l_opt_d4x2(
|
||||
float* a11,
|
||||
float* b11,
|
||||
float* bd11,
|
||||
float* c11, inc_t rs_c, inc_t cs_c
|
||||
float* restrict a11,
|
||||
float* restrict b11,
|
||||
float* restrict bd11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_dtrsm_l_opt_d4x2(
|
||||
double* a11,
|
||||
double* b11,
|
||||
double* bd11,
|
||||
double* c11, inc_t rs_c, inc_t cs_c
|
||||
double* restrict a11,
|
||||
double* restrict b11,
|
||||
double* restrict bd11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
__asm__ volatile
|
||||
@@ -185,20 +185,20 @@ void bli_dtrsm_l_opt_d4x2(
|
||||
}
|
||||
|
||||
void bli_ctrsm_l_opt_d4x2(
|
||||
scomplex* a11,
|
||||
scomplex* b11,
|
||||
scomplex* bd11,
|
||||
scomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_ztrsm_l_opt_d4x2(
|
||||
dcomplex* a11,
|
||||
dcomplex* b11,
|
||||
dcomplex* bd11,
|
||||
dcomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
@@ -37,10 +37,10 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* a11, \
|
||||
ctype* b11, \
|
||||
ctype* bd11, \
|
||||
ctype* c11, inc_t rs_c, inc_t cs_c \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_opt_d4x2 )
|
||||
|
||||
@@ -35,24 +35,28 @@
|
||||
#include "blis.h"
|
||||
|
||||
void bli_sgemm_opt_d4x4(
|
||||
dim_t k,
|
||||
float* alpha,
|
||||
float* a,
|
||||
float* b,
|
||||
float* beta,
|
||||
float* c, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_dgemm_opt_d4x4(
|
||||
dim_t k,
|
||||
double* alpha,
|
||||
double* a,
|
||||
double* b,
|
||||
double* beta,
|
||||
double* c, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
double* restrict b_next
|
||||
)
|
||||
{
|
||||
dim_t k_iter;
|
||||
@@ -447,24 +451,28 @@ void bli_dgemm_opt_d4x4(
|
||||
}
|
||||
|
||||
void bli_cgemm_opt_d4x4(
|
||||
dim_t k,
|
||||
scomplex* alpha,
|
||||
scomplex* a,
|
||||
scomplex* b,
|
||||
scomplex* beta,
|
||||
scomplex* c, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
scomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_zgemm_opt_d4x4(
|
||||
dim_t k,
|
||||
dcomplex* alpha,
|
||||
dcomplex* a,
|
||||
dcomplex* b,
|
||||
dcomplex* beta,
|
||||
dcomplex* c, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
dcomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
@@ -37,12 +37,14 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, \
|
||||
ctype* b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_opt_d4x4 )
|
||||
|
||||
@@ -35,26 +35,30 @@
|
||||
#include "blis.h"
|
||||
|
||||
void bli_sgemmtrsm_l_opt_d4x4(
|
||||
dim_t k,
|
||||
float* a10,
|
||||
float* a11,
|
||||
float* bd01,
|
||||
float* bd11,
|
||||
float* b11,
|
||||
float* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
float* restrict a10,
|
||||
float* restrict a11,
|
||||
float* restrict bd01,
|
||||
float* restrict bd11,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_dgemmtrsm_l_opt_d4x4(
|
||||
dim_t k,
|
||||
double* a10,
|
||||
double* a11,
|
||||
double* bd01,
|
||||
double* bd11,
|
||||
double* b11,
|
||||
double* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
double* restrict a10,
|
||||
double* restrict a11,
|
||||
double* restrict bd01,
|
||||
double* restrict bd11,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
double* restrict b_next
|
||||
)
|
||||
{
|
||||
dim_t k_iter;
|
||||
@@ -500,26 +504,30 @@ void bli_dgemmtrsm_l_opt_d4x4(
|
||||
}
|
||||
|
||||
void bli_cgemmtrsm_l_opt_d4x4(
|
||||
dim_t k,
|
||||
scomplex* a10,
|
||||
scomplex* a11,
|
||||
scomplex* bd01,
|
||||
scomplex* bd11,
|
||||
scomplex* b11,
|
||||
scomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
scomplex* restrict a10,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd01,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
scomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_zgemmtrsm_l_opt_d4x4(
|
||||
dim_t k,
|
||||
dcomplex* a10,
|
||||
dcomplex* a11,
|
||||
dcomplex* bd01,
|
||||
dcomplex* bd11,
|
||||
dcomplex* b11,
|
||||
dcomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
dcomplex* restrict a10,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd01,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
dcomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
@@ -37,13 +37,15 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* a10, \
|
||||
ctype* a11, \
|
||||
ctype* bd01, \
|
||||
ctype* bd11, \
|
||||
ctype* b11, \
|
||||
ctype* c11, inc_t rs_c, inc_t cs_c \
|
||||
dim_t k, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd01, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_d4x4 )
|
||||
|
||||
@@ -35,26 +35,30 @@
|
||||
#include "blis.h"
|
||||
|
||||
void bli_sgemmtrsm_u_opt_d4x4(
|
||||
dim_t k,
|
||||
float* a12,
|
||||
float* a11,
|
||||
float* bd21,
|
||||
float* bd11,
|
||||
float* b11,
|
||||
float* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
float* restrict a12,
|
||||
float* restrict a11,
|
||||
float* restrict bd21,
|
||||
float* restrict bd11,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_dgemmtrsm_u_opt_d4x4(
|
||||
dim_t k,
|
||||
double* a12,
|
||||
double* a11,
|
||||
double* bd21,
|
||||
double* bd11,
|
||||
double* b11,
|
||||
double* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
double* restrict a12,
|
||||
double* restrict a11,
|
||||
double* restrict bd21,
|
||||
double* restrict bd11,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
double* restrict b_next
|
||||
)
|
||||
{
|
||||
dim_t k_iter;
|
||||
@@ -503,26 +507,30 @@ void bli_dgemmtrsm_u_opt_d4x4(
|
||||
}
|
||||
|
||||
void bli_cgemmtrsm_u_opt_d4x4(
|
||||
dim_t k,
|
||||
scomplex* a12,
|
||||
scomplex* a11,
|
||||
scomplex* bd21,
|
||||
scomplex* bd11,
|
||||
scomplex* b11,
|
||||
scomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
scomplex* restrict a12,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd21,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
scomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_zgemmtrsm_u_opt_d4x4(
|
||||
dim_t k,
|
||||
dcomplex* a12,
|
||||
dcomplex* a11,
|
||||
dcomplex* bd21,
|
||||
dcomplex* bd11,
|
||||
dcomplex* b11,
|
||||
dcomplex* c11, inc_t rs_c, inc_t cs_c
|
||||
dim_t k,
|
||||
dcomplex* restrict a12,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd21,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
dcomplex* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
@@ -37,13 +37,15 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* a12, \
|
||||
ctype* a11, \
|
||||
ctype* bd21, \
|
||||
ctype* bd11, \
|
||||
ctype* b11, \
|
||||
ctype* c11, inc_t rs_c, inc_t cs_c \
|
||||
dim_t k, \
|
||||
ctype* restrict a12, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd21, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
ctype* restrict b_next \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_d4x4 )
|
||||
|
||||
Reference in New Issue
Block a user