Added a_next, b_next arguments to micro-kernels.

Details:
- Added two more arguments to the gemm and gemmtrsm micro-kernels: the
  addresses of the next micro-panels of A and B. By passing these
  pointers into the micro-kernel, we allow the micro-kernel author to
  prefetch micro-panels of A and B as necessary (though this is
  completely optional; these addresses may also be safely ignored).
- Updated all seven macro-kernels so that they compute and pass in
  a_next and b_next. Note that ONLY the gemm macro-kernel computes
  a_next and b_next with the precise semantics we want. I will go back
  and fix the other macro-kernels in the near future.
- Added 'restrict' to the pointer parameters of various micro-kernels from
  which it was missing.
This commit is contained in:
Field G. Van Zee
2013-04-23 16:00:18 -05:00
parent f3815dc84d
commit 9d10d7dd9b
43 changed files with 570 additions and 281 deletions

View File

@@ -261,10 +261,13 @@
// -- gemm --
//#define GEMM_UKERNEL gemm_ref_mxn
#define GEMM_UKERNEL gemm_opt_d4x2
// -- trsm-related --
//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x2
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x2

View File

@@ -173,6 +173,8 @@ void PASTEMAC(ch,varname)( \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict a2; \
ctype* restrict b2; \
\
dim_t k_nr; \
dim_t m_iter, m_left; \
@@ -240,13 +242,22 @@ void PASTEMAC(ch,varname)( \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_f( i, m_iter, m_left ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
/*if ( i == n_iter - 1 && n_left == 0 )*/ \
if ( bli_is_last_iter_f( i, n_iter, n_left ) ) \
b2 = b_cast; \
} \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
@@ -254,7 +265,8 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
a1, \
bp, \
beta_cast, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
\
a1 += rstep_a; \
c11 += rstep_c; \
@@ -263,13 +275,21 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
/* Bottom edge handling. */ \
if ( m_left ) \
{ \
/* Compute the addresses of the next panels of A and B. */ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter_f( i, n_iter, n_left ) ) \
b2 = b_cast; \
\
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_left, NR, \
@@ -291,17 +311,29 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Right edge loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_f( i, m_iter, m_left ) ) \
{ \
a2 = a_cast; \
b2 = b_cast; \
} \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale the right edge of C and add the result from above. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn)( MR, n_left, \
@@ -316,13 +348,18 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
/* Bottom-right corner handling. */ \
if ( m_left ) \
{ \
/* Compute the addresses of the next panels of A and B. */ \
a2 = a_cast; \
b2 = b_cast; \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale the bottom-right corner of C and add the result from above. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_left, n_left, \
@@ -331,6 +368,10 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "
c11, rs_c, cs_c ); \
} \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC( gemm_ker_var2, GEMM_UKERNEL )

View File

@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
) \
{ \
const dim_t m = PASTEMAC(ch,mr); \

View File

@@ -45,7 +45,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemm_ref_mxn )

View File

@@ -177,6 +177,8 @@ void PASTEMAC(ch,varname)( \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict a2; \
ctype* restrict b2; \
\
doff_t diagoffc_ij; \
dim_t k_nr; \
@@ -244,12 +246,20 @@ void PASTEMAC(ch,varname)( \
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Compute the address of the next panel of B. */ \
b2 = b1 + cstep_b; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
/* Compute the address of the next panel of A. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 && m_left == 0 ) \
a2 = a_cast; \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
@@ -266,7 +276,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn_l)( diagoffc_ij, \
@@ -283,7 +294,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
beta_cast, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
\
a1 += rstep_a; \
@@ -294,13 +306,17 @@ void PASTEMAC(ch,varname)( \
to factor in here.) */ \
if ( m_left ) \
{ \
/* Compute the address of the next panel of A. */ \
a2 = a_cast; \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale the bottom edge of C and add the result. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_left, NR, \
@@ -322,12 +338,20 @@ void PASTEMAC(ch,varname)( \
of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Compute the address of the next panel of B. */ \
b2 = b_cast; \
\
/* Right edge loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
/* Compute the address of the next panel of A. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 && m_left == 0 ) \
a2 = a_cast; \
\
if ( bli_intersects_diag_n( diagoffc_ij, MR, n_left ) ) \
{ \
@@ -337,7 +361,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn_l)( diagoffc_ij, \
@@ -357,13 +382,17 @@ void PASTEMAC(ch,varname)( \
/* Bottom-right corner handling. */ \
if ( m_left ) \
{ \
/* Compute the address of the next panel of A. */ \
a2 = a_cast; \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn_l)( diagoffc_ij, \

View File

@@ -177,6 +177,8 @@ void PASTEMAC(ch,varname)( \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict a2; \
ctype* restrict b2; \
\
doff_t diagoffc_ij; \
dim_t k_nr; \
@@ -244,12 +246,20 @@ void PASTEMAC(ch,varname)( \
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Compute the address of the next panel of B. */ \
b2 = b1 + cstep_b; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
/* Compute the address of the next panel of A. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 && m_left == 0 ) \
a2 = a_cast; \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
@@ -266,7 +276,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn_u)( diagoffc_ij, \
@@ -283,7 +294,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
beta_cast, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
\
a1 += rstep_a; \
@@ -295,6 +307,9 @@ void PASTEMAC(ch,varname)( \
{ \
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
/* Compute the address of the next panel of A. */ \
a2 = a_cast; \
\
/* The following conditional only executes when the bottom edge
case for this particular column panel happens to intersect the
@@ -307,7 +322,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn_u)( diagoffc_ij, \
@@ -331,18 +347,27 @@ void PASTEMAC(ch,varname)( \
of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Compute the address of the next panel of B. */ \
b2 = b1 + cstep_b; \
\
/* Right edge loop over the m dimension (MR rows at a time). */ \
/* (Note that the diagonal is guaranteed not to factor in here.) */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Compute the address of the next panel of A. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 && m_left == 0 ) \
a2 = a_cast; \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale the right edge of C and add the result. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn)( MR, n_left, \
@@ -360,13 +385,17 @@ void PASTEMAC(ch,varname)( \
/* Bottom-right corner handling. */ \
if ( m_left ) \
{ \
/* Compute the address of the next panel of A. */ \
a2 = a_cast; \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn_u)( diagoffc_ij, \

View File

@@ -181,6 +181,8 @@ void PASTEMAC(ch,varname)( \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict bp_i; \
ctype* restrict a2; \
ctype* restrict b2; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
@@ -281,6 +283,9 @@ void PASTEMAC(ch,varname)( \
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Compute the address of the next panel of B. */ \
b2 = b1 + cstep_b; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -302,6 +307,11 @@ void PASTEMAC(ch,varname)( \
k_a1011 = bli_min( k, diagoffa_i + MR ); \
\
bp_i = bp + off_a1011 * NR * NDUP; \
\
/* Compute the address of the next panel of A. */ \
a2 = a1 + k_a1011 * PACKMR; \
if ( i == m_iter - 1 ) \
a2 = a_cast; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -312,7 +322,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp_i, \
beta_cast, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
else \
{ \
@@ -327,7 +338,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp_i, \
beta_cast, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
@@ -339,6 +351,11 @@ void PASTEMAC(ch,varname)( \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Compute the address of the next panel of A. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 ) \
a2 = a_cast; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
@@ -348,7 +365,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
one, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
else \
{ \
@@ -358,7 +376,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \

View File

@@ -181,6 +181,8 @@ void PASTEMAC(ch,varname)( \
ctype* restrict c1; \
ctype* restrict c11; \
ctype* restrict bp_i; \
ctype* restrict a2; \
ctype* restrict b2; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
@@ -281,6 +283,9 @@ void PASTEMAC(ch,varname)( \
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Compute the address of the next panel of B. */ \
b2 = b1 + cstep_b; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -305,6 +310,11 @@ void PASTEMAC(ch,varname)( \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_u_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_u_ker_var2: b1", k_a1112, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
\
/* Compute the address of the next panel of A. */ \
a2 = a1 + k_a1112 * PACKMR; \
if ( i == m_iter - 1 ) \
a2 = a_cast; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -315,7 +325,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp_i, \
beta_cast, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
else \
{ \
@@ -330,7 +341,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp_i, \
beta_cast, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
@@ -342,6 +354,11 @@ void PASTEMAC(ch,varname)( \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Compute the address of the next panel of A. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 ) \
a2 = a_cast; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
@@ -351,7 +368,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
one, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
else \
{ \
@@ -361,7 +379,8 @@ void PASTEMAC(ch,varname)( \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \

View File

@@ -175,6 +175,8 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bp01; \
ctype* restrict bp11; \
ctype* restrict bp_i; \
ctype* restrict a2; \
ctype* restrict b2; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
@@ -284,6 +286,9 @@ void PASTEMAC(ch,varname)( \
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Compute the address of the next panel of B. */ \
b2 = b1 + cstep_b; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -324,6 +329,11 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: a11 (diag)", MR, MR, a11, 1, MR,
PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/* Compute the address of the next panel of A. */ \
a2 = a1 + k_a1011 * PACKMR; \
if ( i == m_iter - 1 ) \
a2 = a_cast; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -336,7 +346,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR,
bp01, \
bp11, \
b11, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
else \
{ \
@@ -348,7 +359,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR,
bp01, \
bp11, \
b11, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \
@@ -364,6 +376,10 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR,
PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
/* Compute the address of the next panel of A. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 ) \
a2 = a_cast; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -374,7 +390,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "
a1, \
bp, \
alpha_cast, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
else \
{ \
@@ -384,7 +401,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_cur, n_cur, \

View File

@@ -175,6 +175,8 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bp21; \
ctype* restrict bp11; \
ctype* restrict bp_i; \
ctype* restrict a2; \
ctype* restrict b2; \
\
doff_t diagoffa_i; \
dim_t m_iter, m_left; \
@@ -284,6 +286,9 @@ void PASTEMAC(ch,varname)( \
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/* Compute the address of the next panel of B. */ \
b2 = b1 + cstep_b; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( ib = 0; ib < m_iter; ++ib ) \
@@ -338,7 +343,11 @@ printf( "k_a11 = %lu\n", k_a11 ); \
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
\
/* Compute the address of the next panel of A. */ \
a2 = a1 + k_a1112 * PACKMR; \
if ( i == m_iter - 1 ) \
a2 = a_cast; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -351,7 +360,8 @@ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
bp21, \
bp11, \
b11, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
else \
{ \
@@ -363,7 +373,8 @@ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
bp21, \
bp11, \
b11, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: bp11 after (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
@@ -380,6 +391,11 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur,
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* Compute the address of the next panel of A. */ \
a2 = a1 + rstep_a; \
if ( i == m_iter - 1 ) \
a2 = a_cast; \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
@@ -389,7 +405,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur,
a1, \
bp, \
alpha_cast, \
c11, rs_c, cs_c ); \
c11, rs_c, cs_c, \
a2, b2 ); \
} \
else \
{ \
@@ -399,7 +416,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur,
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct ); \
ct, rs_ct, cs_ct, \
a2, b2 ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_cur, n_cur, \

View File

@@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bdT, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
) \
{ \
const inc_t rs_b = PASTEMAC(ch,packnr); \
@@ -60,7 +62,9 @@ void PASTEMAC(ch,varname)( \
aL, \
bdT, \
alpha, \
b, rs_b, cs_b ); \
b, rs_b, cs_b, \
a_next, \
b_next ); \
\
/* b = inv(a) * b;
bd = b; (if gemm ukernel needs duplicated B)

View File

@@ -47,7 +47,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bdT, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_l_ref_mxn )

View File

@@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bdB, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
) \
{ \
const inc_t rs_b = PASTEMAC(ch,packnr); \
@@ -60,7 +62,9 @@ void PASTEMAC(ch,varname)( \
aR, \
bdB, \
alpha, \
b, rs_b, cs_b ); \
b, rs_b, cs_b, \
a_next, \
b_next ); \
\
/* b = inv(a) * b;
bd = b; (if gemm ukernel needs duplicated B)

View File

@@ -47,7 +47,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bdB, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_u_ref_mxn )

View File

@@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \
const dim_t m = PASTEMAC(ch,mr); \

View File

@@ -43,7 +43,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_l_ref_mxn )

View File

@@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \
const dim_t m = PASTEMAC(ch,mr); \

View File

@@ -43,7 +43,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict bd, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_u_ref_mxn )

View File

@@ -424,6 +424,10 @@
\
( i1 != iter - 1 || left == 0 )
// Evaluates to nonzero when i1 is the final index of a loop of iter
// iterations (i1 == iter - 1) and there is no leftover edge case
// (left == 0). Each argument is parenthesized so that expression
// arguments (e.g. "n & 1") keep their intended grouping after expansion.
// NOTE(review): the _f suffix presumably denotes forward iteration -- confirm.
#define bli_is_last_iter_f( i1, iter, left ) \
\
( (i1) == (iter) - 1 && (left) == 0 )
// Evaluates to nonzero when i1 is index 0 and a leftover edge case exists
// (left != 0). The iter argument is accepted but not used by the
// expansion. Each used argument is parenthesized so that expression
// arguments keep their intended grouping after expansion.
// NOTE(review): the _b suffix presumably denotes backward iteration, where
// index 0 holds the edge case -- confirm.
#define bli_is_edge_b( i1, iter, left ) \
\
( (i1) == 0 && (left) != 0 )
@@ -432,6 +436,10 @@
\
( i1 != 0 || left == 0 )
// Evaluates to nonzero when i1 is index 0 and there is no leftover edge
// case (left == 0). The iter argument is accepted but not used by the
// expansion. Each used argument is parenthesized so that expression
// arguments keep their intended grouping after expansion.
// NOTE(review): the _b suffix presumably denotes backward iteration, where
// index 0 is visited last -- confirm.
#define bli_is_last_iter_b( i1, iter, left ) \
\
( (i1) == 0 && (left) == 0 )
// packbuf_t-related

View File

@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
) \
{ \
ctype a0; \

View File

@@ -42,7 +42,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemm_ref_4x2 )

View File

@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
) \
{ \
ctype a0; \

View File

@@ -42,7 +42,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemm_ref_4x4 )

View File

@@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bdT, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
) \
{ \
ctype* minus_one = PASTEMAC(ch,m1); \
@@ -59,7 +61,9 @@ void PASTEMAC(ch,varname)( \
aL, \
bdT, \
alpha, \
b, rs_b, cs_b ); \
b, rs_b, cs_b, \
a_next, \
b_next ); \
\
PASTEMAC(ch,trsmukr)( a, \
b, \

View File

@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bdT, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_l_ref_4x4 )

View File

@@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bdB, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
) \
{ \
ctype* minus_one = PASTEMAC(ch,m1); \
@@ -59,7 +61,8 @@ void PASTEMAC(ch,varname)( \
aR, \
bdB, \
alpha, \
b, rs_b, cs_b ); \
b, rs_b, cs_b, \
a_next, b_next ); \
\
PASTEMAC(ch,trsmukr)( a, \
b, \

View File

@@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict bdB, \
ctype* restrict bd, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_u_ref_4x4 )

View File

@@ -35,24 +35,24 @@
#include "blis.h"
void bli_sgemm_opt_d2x4(
dim_t k,
float* alpha,
float* a,
float* b,
float* beta,
float* c, inc_t rs_c, inc_t cs_c
dim_t k,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_dgemm_opt_d2x4(
dim_t k,
double* alpha,
double* a,
double* b,
double* beta,
double* c, inc_t rs_c, inc_t cs_c
dim_t k,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c
)
{
dim_t k_iter;
@@ -366,24 +366,24 @@ void bli_dgemm_opt_d2x4(
}
void bli_cgemm_opt_d2x4(
dim_t k,
scomplex* alpha,
scomplex* a,
scomplex* b,
scomplex* beta,
scomplex* c, inc_t rs_c, inc_t cs_c
dim_t k,
scomplex* restrict alpha,
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict beta,
scomplex* restrict c, inc_t rs_c, inc_t cs_c
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_zgemm_opt_d2x4(
dim_t k,
dcomplex* alpha,
dcomplex* a,
dcomplex* b,
dcomplex* beta,
dcomplex* c, inc_t rs_c, inc_t cs_c
dim_t k,
dcomplex* restrict alpha,
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict beta,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

View File

@@ -37,12 +37,12 @@
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* alpha, \
ctype* a, \
ctype* b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( gemm_opt_d2x4 )

View File

@@ -40,7 +40,9 @@ void bli_sgemm_opt_d4x2(
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c
float* restrict c, inc_t rs_c, inc_t cs_c,
float* restrict a_next,
float* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
@@ -52,7 +54,9 @@ void bli_dgemm_opt_d4x2(
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c
double* restrict c, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
double* restrict b_next
)
{
dim_t k_iter;
@@ -325,7 +329,9 @@ void bli_cgemm_opt_d4x2(
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict beta,
scomplex* restrict c, inc_t rs_c, inc_t cs_c
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
scomplex* restrict a_next,
scomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
@@ -337,7 +343,9 @@ void bli_zgemm_opt_d4x2(
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict beta,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,
dcomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

View File

@@ -42,7 +42,9 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemm_opt_d4x2 )

View File

@@ -35,26 +35,32 @@
#include "blis.h"
void bli_sgemmtrsm_l_opt_d4x2(
dim_t k,
float* a10,
float* a11,
float* bd01,
float* bd11,
float* b11,
float* c11, inc_t rs_c, inc_t cs_c
dim_t k,
float* restrict alpha,
float* restrict a10,
float* restrict a11,
float* restrict bd01,
float* restrict bd11,
float* restrict b11,
float* restrict c11, inc_t rs_c, inc_t cs_c,
float* restrict a_next,
float* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_dgemmtrsm_l_opt_d4x2(
dim_t k,
double* a10,
double* a11,
double* bd01,
double* bd11,
double* b11,
double* c11, inc_t rs_c, inc_t cs_c
dim_t k,
double* restrict alpha,
double* restrict a10,
double* restrict a11,
double* restrict bd01,
double* restrict bd11,
double* restrict b11,
double* restrict c11, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
double* restrict b_next
)
{
dim_t k_iter;
@@ -405,26 +411,32 @@ void bli_dgemmtrsm_l_opt_d4x2(
}
void bli_cgemmtrsm_l_opt_d4x2(
dim_t k,
scomplex* a10,
scomplex* a11,
scomplex* bd01,
scomplex* bd11,
scomplex* b11,
scomplex* c11, inc_t rs_c, inc_t cs_c
dim_t k,
scomplex* restrict alpha,
scomplex* restrict a10,
scomplex* restrict a11,
scomplex* restrict bd01,
scomplex* restrict bd11,
scomplex* restrict b11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
scomplex* restrict a_next,
scomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_zgemmtrsm_l_opt_d4x2(
dim_t k,
dcomplex* a10,
dcomplex* a11,
dcomplex* bd01,
dcomplex* bd11,
dcomplex* b11,
dcomplex* c11, inc_t rs_c, inc_t cs_c
dim_t k,
dcomplex* restrict alpha,
dcomplex* restrict a10,
dcomplex* restrict a11,
dcomplex* restrict bd01,
dcomplex* restrict bd11,
dcomplex* restrict b11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,
dcomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

View File

@@ -37,13 +37,16 @@
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* a10, \
ctype* a11, \
ctype* bd01, \
ctype* bd11, \
ctype* b11, \
ctype* c11, inc_t rs_c, inc_t cs_c \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict bd01, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_d4x2 )

View File

@@ -35,26 +35,32 @@
#include "blis.h"
// float stub of the gemmtrsm_u_opt_d4x2 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// parameters -- an unqualified run (k .. cs_c) followed, with no separating
// comma, by a restrict-qualified run that adds alpha, a_next and b_next.
// This looks like the before/after lines of a diff; as written it is not a
// valid C declaration -- confirm which run is intended.
void bli_sgemmtrsm_u_opt_d4x2(
// First run: unqualified pointers.
dim_t k,
float* a12,
float* a11,
float* bd21,
float* bd11,
float* b11,
float* c11, inc_t rs_c, inc_t cs_c
// Second run: restrict-qualified pointers plus alpha, a_next and b_next.
dim_t k,
float* restrict alpha,
float* restrict a12,
float* restrict a11,
float* restrict bd21,
float* restrict bd11,
float* restrict b11,
float* restrict c11, inc_t rs_c, inc_t cs_c,
// Per the change description: addresses of the next micro-panels of A and B,
// offered for optional prefetching. Unused by this stub.
float* restrict a_next,
float* restrict b_next
)
{
// Not implemented for float: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_dgemmtrsm_u_opt_d4x2(
dim_t k,
double* a12,
double* a11,
double* bd21,
double* bd11,
double* b11,
double* c11, inc_t rs_c, inc_t cs_c
dim_t k,
double* restrict alpha,
double* restrict a12,
double* restrict a11,
double* restrict bd21,
double* restrict bd11,
double* restrict b11,
double* restrict c11, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
double* restrict b_next
)
{
dim_t k_iter;
@@ -408,26 +414,32 @@ void bli_dgemmtrsm_u_opt_d4x2(
}
// scomplex stub of the gemmtrsm_u_opt_d4x2 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// parameters -- an unqualified run (k .. cs_c) followed, with no separating
// comma, by a restrict-qualified run that adds alpha, a_next and b_next.
// This looks like the before/after lines of a diff; as written it is not a
// valid C declaration -- confirm which run is intended.
void bli_cgemmtrsm_u_opt_d4x2(
// First run: unqualified pointers.
dim_t k,
scomplex* a12,
scomplex* a11,
scomplex* bd21,
scomplex* bd11,
scomplex* b11,
scomplex* c11, inc_t rs_c, inc_t cs_c
// Second run: restrict-qualified pointers plus alpha, a_next and b_next.
dim_t k,
scomplex* restrict alpha,
scomplex* restrict a12,
scomplex* restrict a11,
scomplex* restrict bd21,
scomplex* restrict bd11,
scomplex* restrict b11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
// Per the change description: addresses of the next micro-panels of A and B,
// offered for optional prefetching. Unused by this stub.
scomplex* restrict a_next,
scomplex* restrict b_next
)
{
// Not implemented for scomplex: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_zgemmtrsm_u_opt_d4x2(
dim_t k,
dcomplex* a12,
dcomplex* a11,
dcomplex* bd21,
dcomplex* bd11,
dcomplex* b11,
dcomplex* c11, inc_t rs_c, inc_t cs_c
dim_t k,
dcomplex* restrict alpha,
dcomplex* restrict a12,
dcomplex* restrict a11,
dcomplex* restrict bd21,
dcomplex* restrict bd11,
dcomplex* restrict b11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,
dcomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

View File

@@ -37,13 +37,16 @@
/*
   Prototype template for the gemmtrsm_u_opt_d4x2 micro-kernel.
   GENTPROT( ctype, ch, varname ) declares a void function named
   PASTEMAC(ch,varname) whose pointer parameters use ctype.
   INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_d4x2 ) then instantiates it
   (presumably once per basic datatype; that macro is defined elsewhere --
   confirm).

   NOTE(review): the parameter list holds an unqualified run (k .. cs_c)
   followed, with no separating comma, by a restrict-qualified run that adds
   alpha, a_next and b_next. This looks like the before/after lines of a
   diff; as written the expansion is not a valid declaration -- confirm which
   run is intended.
*/
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* a12, \
ctype* a11, \
ctype* bd21, \
ctype* bd11, \
ctype* b11, \
ctype* c11, inc_t rs_c, inc_t cs_c \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a12, \
ctype* restrict a11, \
ctype* restrict bd21, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_d4x2 )

View File

@@ -35,20 +35,20 @@
#include "blis.h"
// float stub of the trsm_l_opt_d4x2 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// four parameter lines -- unqualified pointers, then (with no separating
// comma) the same names restrict-qualified. This looks like the before/after
// lines of a diff; as written it is not a valid C declaration -- confirm
// which run is intended.
void bli_strsm_l_opt_d4x2(
// First run: unqualified pointers.
float* a11,
float* b11,
float* bd11,
float* c11, inc_t rs_c, inc_t cs_c
// Second run: identical names, restrict-qualified.
float* restrict a11,
float* restrict b11,
float* restrict bd11,
float* restrict c11, inc_t rs_c, inc_t cs_c
)
{
// Not implemented for float: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_dtrsm_l_opt_d4x2(
double* a11,
double* b11,
double* bd11,
double* c11, inc_t rs_c, inc_t cs_c
double* restrict a11,
double* restrict b11,
double* restrict bd11,
double* restrict c11, inc_t rs_c, inc_t cs_c
)
{
__asm__ volatile
@@ -185,20 +185,20 @@ void bli_dtrsm_l_opt_d4x2(
}
// scomplex stub of the trsm_l_opt_d4x2 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// four parameter lines -- unqualified pointers, then (with no separating
// comma) the same names restrict-qualified. This looks like the before/after
// lines of a diff; as written it is not a valid C declaration -- confirm
// which run is intended.
void bli_ctrsm_l_opt_d4x2(
// First run: unqualified pointers.
scomplex* a11,
scomplex* b11,
scomplex* bd11,
scomplex* c11, inc_t rs_c, inc_t cs_c
// Second run: identical names, restrict-qualified.
scomplex* restrict a11,
scomplex* restrict b11,
scomplex* restrict bd11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
// Not implemented for scomplex: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_ztrsm_l_opt_d4x2(
dcomplex* a11,
dcomplex* b11,
dcomplex* bd11,
dcomplex* c11, inc_t rs_c, inc_t cs_c
dcomplex* restrict a11,
dcomplex* restrict b11,
dcomplex* restrict bd11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

View File

@@ -37,10 +37,10 @@
/*
   Prototype template for the trsm_l_opt_d4x2 micro-kernel.
   GENTPROT( ctype, ch, varname ) declares a void function named
   PASTEMAC(ch,varname) whose pointer parameters use ctype.
   INSERT_GENTPROT_BASIC( trsm_l_opt_d4x2 ) then instantiates it
   (presumably once per basic datatype; that macro is defined elsewhere --
   confirm).

   NOTE(review): the parameter list holds the four parameter lines twice --
   unqualified, then (with no separating comma) restrict-qualified. This
   looks like the before/after lines of a diff; as written the expansion is
   not a valid declaration -- confirm which run is intended.
*/
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
ctype* a11, \
ctype* b11, \
ctype* bd11, \
ctype* c11, inc_t rs_c, inc_t cs_c \
ctype* restrict a11, \
ctype* restrict b11, \
ctype* restrict bd11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC( trsm_l_opt_d4x2 )

View File

@@ -35,24 +35,28 @@
#include "blis.h"
// float stub of the gemm_opt_d4x4 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// parameters -- an unqualified run (k .. cs_c) followed, with no separating
// comma, by a restrict-qualified run that adds a_next and b_next. This looks
// like the before/after lines of a diff; as written it is not a valid C
// declaration -- confirm which run is intended.
void bli_sgemm_opt_d4x4(
// First run: unqualified pointers.
dim_t k,
float* alpha,
float* a,
float* b,
float* beta,
float* c, inc_t rs_c, inc_t cs_c
// Second run: restrict-qualified pointers plus a_next and b_next.
dim_t k,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
// Per the change description: addresses of the next micro-panels of A and B,
// offered for optional prefetching. Unused by this stub.
float* restrict a_next,
float* restrict b_next
)
{
// Not implemented for float: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_dgemm_opt_d4x4(
dim_t k,
double* alpha,
double* a,
double* b,
double* beta,
double* c, inc_t rs_c, inc_t cs_c
dim_t k,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
double* restrict b_next
)
{
dim_t k_iter;
@@ -447,24 +451,28 @@ void bli_dgemm_opt_d4x4(
}
// scomplex stub of the gemm_opt_d4x4 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// parameters -- an unqualified run (k .. cs_c) followed, with no separating
// comma, by a restrict-qualified run that adds a_next and b_next. This looks
// like the before/after lines of a diff; as written it is not a valid C
// declaration -- confirm which run is intended.
void bli_cgemm_opt_d4x4(
// First run: unqualified pointers.
dim_t k,
scomplex* alpha,
scomplex* a,
scomplex* b,
scomplex* beta,
scomplex* c, inc_t rs_c, inc_t cs_c
// Second run: restrict-qualified pointers plus a_next and b_next.
dim_t k,
scomplex* restrict alpha,
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict beta,
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
// Per the change description: addresses of the next micro-panels of A and B,
// offered for optional prefetching. Unused by this stub.
scomplex* restrict a_next,
scomplex* restrict b_next
)
{
// Not implemented for scomplex: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_zgemm_opt_d4x4(
dim_t k,
dcomplex* alpha,
dcomplex* a,
dcomplex* b,
dcomplex* beta,
dcomplex* c, inc_t rs_c, inc_t cs_c
dim_t k,
dcomplex* restrict alpha,
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict beta,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,
dcomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

View File

@@ -37,12 +37,14 @@
/*
   Prototype template for the gemm_opt_d4x4 micro-kernel.
   GENTPROT( ctype, ch, varname ) declares a void function named
   PASTEMAC(ch,varname) whose pointer parameters use ctype.
   INSERT_GENTPROT_BASIC( gemm_opt_d4x4 ) then instantiates it
   (presumably once per basic datatype; that macro is defined elsewhere --
   confirm).

   NOTE(review): the parameter list holds an unqualified run (k .. cs_c)
   followed, with no separating comma, by a restrict-qualified run that adds
   a_next and b_next. This looks like the before/after lines of a diff; as
   written the expansion is not a valid declaration -- confirm which run is
   intended.
*/
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* alpha, \
ctype* a, \
ctype* b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemm_opt_d4x4 )

View File

@@ -35,26 +35,30 @@
#include "blis.h"
// float stub of the gemmtrsm_l_opt_d4x4 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// parameters -- an unqualified run (k .. cs_c) followed, with no separating
// comma, by a restrict-qualified run that adds a_next and b_next. This looks
// like the before/after lines of a diff; as written it is not a valid C
// declaration -- confirm which run is intended.
// NOTE(review): neither run has an alpha parameter, whereas the
// restrict-qualified gemmtrsm_l_opt_d4x2 signatures do -- confirm whether
// that difference is intentional.
void bli_sgemmtrsm_l_opt_d4x4(
// First run: unqualified pointers.
dim_t k,
float* a10,
float* a11,
float* bd01,
float* bd11,
float* b11,
float* c11, inc_t rs_c, inc_t cs_c
// Second run: restrict-qualified pointers plus a_next and b_next.
dim_t k,
float* restrict a10,
float* restrict a11,
float* restrict bd01,
float* restrict bd11,
float* restrict b11,
float* restrict c11, inc_t rs_c, inc_t cs_c,
// Per the change description: addresses of the next micro-panels of A and B,
// offered for optional prefetching. Unused by this stub.
float* restrict a_next,
float* restrict b_next
)
{
// Not implemented for float: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_dgemmtrsm_l_opt_d4x4(
dim_t k,
double* a10,
double* a11,
double* bd01,
double* bd11,
double* b11,
double* c11, inc_t rs_c, inc_t cs_c
dim_t k,
double* restrict a10,
double* restrict a11,
double* restrict bd01,
double* restrict bd11,
double* restrict b11,
double* restrict c11, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
double* restrict b_next
)
{
dim_t k_iter;
@@ -500,26 +504,30 @@ void bli_dgemmtrsm_l_opt_d4x4(
}
// scomplex stub of the gemmtrsm_l_opt_d4x4 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// parameters -- an unqualified run (k .. cs_c) followed, with no separating
// comma, by a restrict-qualified run that adds a_next and b_next. This looks
// like the before/after lines of a diff; as written it is not a valid C
// declaration -- confirm which run is intended.
// NOTE(review): neither run has an alpha parameter, whereas the
// restrict-qualified gemmtrsm_l_opt_d4x2 signatures do -- confirm whether
// that difference is intentional.
void bli_cgemmtrsm_l_opt_d4x4(
// First run: unqualified pointers.
dim_t k,
scomplex* a10,
scomplex* a11,
scomplex* bd01,
scomplex* bd11,
scomplex* b11,
scomplex* c11, inc_t rs_c, inc_t cs_c
// Second run: restrict-qualified pointers plus a_next and b_next.
dim_t k,
scomplex* restrict a10,
scomplex* restrict a11,
scomplex* restrict bd01,
scomplex* restrict bd11,
scomplex* restrict b11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
// Per the change description: addresses of the next micro-panels of A and B,
// offered for optional prefetching. Unused by this stub.
scomplex* restrict a_next,
scomplex* restrict b_next
)
{
// Not implemented for scomplex: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_zgemmtrsm_l_opt_d4x4(
dim_t k,
dcomplex* a10,
dcomplex* a11,
dcomplex* bd01,
dcomplex* bd11,
dcomplex* b11,
dcomplex* c11, inc_t rs_c, inc_t cs_c
dim_t k,
dcomplex* restrict a10,
dcomplex* restrict a11,
dcomplex* restrict bd01,
dcomplex* restrict bd11,
dcomplex* restrict b11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,
dcomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

View File

@@ -37,13 +37,15 @@
/*
   Prototype template for the gemmtrsm_l_opt_d4x4 micro-kernel.
   GENTPROT( ctype, ch, varname ) declares a void function named
   PASTEMAC(ch,varname) whose pointer parameters use ctype.
   INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_d4x4 ) then instantiates it
   (presumably once per basic datatype; that macro is defined elsewhere --
   confirm).

   NOTE(review): the parameter list holds an unqualified run (k .. cs_c)
   followed, with no separating comma, by a restrict-qualified run that adds
   a_next and b_next. This looks like the before/after lines of a diff; as
   written the expansion is not a valid declaration -- confirm which run is
   intended.
   NOTE(review): neither run has an alpha parameter, whereas the
   restrict-qualified gemmtrsm_l_opt_d4x2 prototype does -- confirm whether
   that difference is intentional.
*/
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* a10, \
ctype* a11, \
ctype* bd01, \
ctype* bd11, \
ctype* b11, \
ctype* c11, inc_t rs_c, inc_t cs_c \
dim_t k, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict bd01, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_d4x4 )

View File

@@ -35,26 +35,30 @@
#include "blis.h"
// float stub of the gemmtrsm_u_opt_d4x4 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// parameters -- an unqualified run (k .. cs_c) followed, with no separating
// comma, by a restrict-qualified run that adds a_next and b_next. This looks
// like the before/after lines of a diff; as written it is not a valid C
// declaration -- confirm which run is intended.
// NOTE(review): neither run has an alpha parameter, whereas the
// restrict-qualified gemmtrsm_u_opt_d4x2 signatures do -- confirm whether
// that difference is intentional.
void bli_sgemmtrsm_u_opt_d4x4(
// First run: unqualified pointers.
dim_t k,
float* a12,
float* a11,
float* bd21,
float* bd11,
float* b11,
float* c11, inc_t rs_c, inc_t cs_c
// Second run: restrict-qualified pointers plus a_next and b_next.
dim_t k,
float* restrict a12,
float* restrict a11,
float* restrict bd21,
float* restrict bd11,
float* restrict b11,
float* restrict c11, inc_t rs_c, inc_t cs_c,
// Per the change description: addresses of the next micro-panels of A and B,
// offered for optional prefetching. Unused by this stub.
float* restrict a_next,
float* restrict b_next
)
{
// Not implemented for float: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_dgemmtrsm_u_opt_d4x4(
dim_t k,
double* a12,
double* a11,
double* bd21,
double* bd11,
double* b11,
double* c11, inc_t rs_c, inc_t cs_c
dim_t k,
double* restrict a12,
double* restrict a11,
double* restrict bd21,
double* restrict bd11,
double* restrict b11,
double* restrict c11, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
double* restrict b_next
)
{
dim_t k_iter;
@@ -503,26 +507,30 @@ void bli_dgemmtrsm_u_opt_d4x4(
}
// scomplex stub of the gemmtrsm_u_opt_d4x4 micro-kernel. The body only calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); no argument is read.
//
// NOTE(review): the parameter list holds two consecutive runs of the same
// parameters -- an unqualified run (k .. cs_c) followed, with no separating
// comma, by a restrict-qualified run that adds a_next and b_next. This looks
// like the before/after lines of a diff; as written it is not a valid C
// declaration -- confirm which run is intended.
// NOTE(review): neither run has an alpha parameter, whereas the
// restrict-qualified gemmtrsm_u_opt_d4x2 signatures do -- confirm whether
// that difference is intentional.
void bli_cgemmtrsm_u_opt_d4x4(
// First run: unqualified pointers.
dim_t k,
scomplex* a12,
scomplex* a11,
scomplex* bd21,
scomplex* bd11,
scomplex* b11,
scomplex* c11, inc_t rs_c, inc_t cs_c
// Second run: restrict-qualified pointers plus a_next and b_next.
dim_t k,
scomplex* restrict a12,
scomplex* restrict a11,
scomplex* restrict bd21,
scomplex* restrict bd11,
scomplex* restrict b11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
// Per the change description: addresses of the next micro-panels of A and B,
// offered for optional prefetching. Unused by this stub.
scomplex* restrict a_next,
scomplex* restrict b_next
)
{
// Not implemented for scomplex: only the error code is reported.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_zgemmtrsm_u_opt_d4x4(
dim_t k,
dcomplex* a12,
dcomplex* a11,
dcomplex* bd21,
dcomplex* bd11,
dcomplex* b11,
dcomplex* c11, inc_t rs_c, inc_t cs_c
dim_t k,
dcomplex* restrict a12,
dcomplex* restrict a11,
dcomplex* restrict bd21,
dcomplex* restrict bd11,
dcomplex* restrict b11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,
dcomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

View File

@@ -37,13 +37,15 @@
/*
   Prototype template for the gemmtrsm_u_opt_d4x4 micro-kernel.
   GENTPROT( ctype, ch, varname ) declares a void function named
   PASTEMAC(ch,varname) whose pointer parameters use ctype.
   INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_d4x4 ) then instantiates it
   (presumably once per basic datatype; that macro is defined elsewhere --
   confirm).

   NOTE(review): the parameter list holds an unqualified run (k .. cs_c)
   followed, with no separating comma, by a restrict-qualified run that adds
   a_next and b_next. This looks like the before/after lines of a diff; as
   written the expansion is not a valid declaration -- confirm which run is
   intended.
   NOTE(review): neither run has an alpha parameter, whereas the
   restrict-qualified gemmtrsm_u_opt_d4x2 prototype does -- confirm whether
   that difference is intentional.
*/
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* a12, \
ctype* a11, \
ctype* bd21, \
ctype* bd11, \
ctype* b11, \
ctype* c11, inc_t rs_c, inc_t cs_c \
dim_t k, \
ctype* restrict a12, \
ctype* restrict a11, \
ctype* restrict bd21, \
ctype* restrict bd11, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_d4x4 )