diff --git a/config/flame/bli_kernel.h b/config/flame/bli_kernel.h index 561c7af64..1adf8adc7 100644 --- a/config/flame/bli_kernel.h +++ b/config/flame/bli_kernel.h @@ -261,10 +261,13 @@ // -- gemm -- +//#define GEMM_UKERNEL gemm_ref_mxn #define GEMM_UKERNEL gemm_opt_d4x2 // -- trsm-related -- +//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn +//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn #define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x2 #define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x2 diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index fdb7d03e4..a50c008d4 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -173,6 +173,8 @@ void PASTEMAC(ch,varname)( \ ctype* restrict b1; \ ctype* restrict c1; \ ctype* restrict c11; \ + ctype* restrict a2; \ + ctype* restrict b2; \ \ dim_t k_nr; \ dim_t m_iter, m_left; \ @@ -240,13 +242,22 @@ void PASTEMAC(ch,varname)( \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ \ -/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ -/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_f( i, m_iter, m_left ) ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + /* Wrap B to its first panel only after the last column panel (index j). */ \ + if ( bli_is_last_iter_f( j, n_iter, n_left ) ) \ + b2 = b_cast; \ + } \ \ /* Invoke the gemm micro-kernel. 
*/ \ PASTEMAC(ch,ukrname)( k, \ @@ -254,7 +265,8 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, " a1, \ bp, \ beta_cast, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ \ a1 += rstep_a; \ c11 += rstep_c; \ @@ -263,13 +275,21 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, " /* Bottom edge handling. */ \ if ( m_left ) \ { \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter_f( j, n_iter, n_left ) ) \ + b2 = b_cast; \ +\ +\ /* Invoke the gemm micro-kernel. */ \ PASTEMAC(ch,ukrname)( k, \ alpha_cast, \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale the bottom edge of C and add the result from above. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_left, NR, \ @@ -291,17 +311,29 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, " of B to a local buffer with each value duplicated. */ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ \ /* Right edge loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter_f( i, m_iter, m_left ) ) \ + { \ + a2 = a_cast; \ + b2 = b_cast; \ + } \ +\ /* Invoke the gemm micro-kernel. */ \ PASTEMAC(ch,ukrname)( k, \ alpha_cast, \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale the right edge of C and add the result from above. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn)( MR, n_left, \ @@ -316,13 +348,18 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, " /* Bottom-right corner handling. */ \ if ( m_left ) \ { \ + /* Compute the addresses of the next panels of A and B. 
*/ \ + a2 = a_cast; \ + b2 = b_cast; \ +\ /* Invoke the gemm micro-kernel. */ \ PASTEMAC(ch,ukrname)( k, \ alpha_cast, \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale the bottom-right corner of C and add the result from above. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_left, n_left, \ @@ -331,6 +368,10 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, " c11, rs_c, cs_c ); \ } \ } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC( gemm_ker_var2, GEMM_UKERNEL ) diff --git a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c b/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c index d7213278c..e5d306c5c 100644 --- a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c +++ b/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c @@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ) \ { \ const dim_t m = PASTEMAC(ch,mr); \ diff --git a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.h b/frame/3/gemm/ukernels/bli_gemm_ref_mxn.h index 6a85ca9ec..927776e3d 100644 --- a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.h +++ b/frame/3/gemm/ukernels/bli_gemm_ref_mxn.h @@ -45,7 +45,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemm_ref_mxn ) diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index ac187e076..21a7736fc 100644 --- 
a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -177,6 +177,8 @@ void PASTEMAC(ch,varname)( \ ctype* restrict b1; \ ctype* restrict c1; \ ctype* restrict c11; \ + ctype* restrict a2; \ + ctype* restrict b2; \ \ doff_t diagoffc_ij; \ dim_t k_nr; \ @@ -244,12 +246,20 @@ void PASTEMAC(ch,varname)( \ columns of B to a local buffer with each value duplicated. */ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ + /* Compute the address of the next panel of B. */ \ + b2 = b1 + cstep_b; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + /* Compute the address of the next panel of A. */ \ + a2 = a1 + rstep_a; \ + if ( i == m_iter - 1 && m_left == 0 ) \ + a2 = a_cast; \ \ /* If the diagonal intersects the current MR x NR submatrix, we compute it the temporary buffer and then add in the elements @@ -266,7 +276,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale C and add the result to only the stored part. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn_l)( diagoffc_ij, \ @@ -283,7 +294,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ beta_cast, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ \ a1 += rstep_a; \ @@ -294,13 +306,17 @@ void PASTEMAC(ch,varname)( \ to factor in here.) */ \ if ( m_left ) \ { \ + /* Compute the address of the next panel of A. */ \ + a2 = a_cast; \ +\ /* Invoke the gemm micro-kernel. */ \ PASTEMAC(ch,ukrname)( k, \ alpha_cast, \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale the bottom edge of C and add the result. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_left, NR, \ @@ -322,12 +338,20 @@ void PASTEMAC(ch,varname)( \ of B to a local buffer with each value duplicated. 
*/ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ + /* Compute the address of the next panel of B. */ \ + b2 = b_cast; \ \ /* Right edge loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + /* Compute the address of the next panel of A. */ \ + a2 = a1 + rstep_a; \ + if ( i == m_iter - 1 && m_left == 0 ) \ + a2 = a_cast; \ \ if ( bli_intersects_diag_n( diagoffc_ij, MR, n_left ) ) \ { \ @@ -337,7 +361,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale C and add the result to only the stored part. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn_l)( diagoffc_ij, \ @@ -357,13 +382,17 @@ void PASTEMAC(ch,varname)( \ /* Bottom-right corner handling. */ \ if ( m_left ) \ { \ + /* Compute the address of the next panel of A. */ \ + a2 = a_cast; \ +\ /* Invoke the gemm micro-kernel. */ \ PASTEMAC(ch,ukrname)( k, \ alpha_cast, \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale C and add the result to only the stored part. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn_l)( diagoffc_ij, \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 26bb1d904..54970031d 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -177,6 +177,8 @@ void PASTEMAC(ch,varname)( \ ctype* restrict b1; \ ctype* restrict c1; \ ctype* restrict c11; \ + ctype* restrict a2; \ + ctype* restrict b2; \ \ doff_t diagoffc_ij; \ dim_t k_nr; \ @@ -244,12 +246,20 @@ void PASTEMAC(ch,varname)( \ columns of B to a local buffer with each value duplicated. */ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ + /* Compute the address of the next panel of B. */ \ + b2 = b1 + cstep_b; \ \ /* Interior loop over the m dimension (MR rows at a time). 
*/ \ for ( i = 0; i < m_iter; ++i ) \ { \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + /* Compute the address of the next panel of A. */ \ + a2 = a1 + rstep_a; \ + if ( i == m_iter - 1 && m_left == 0 ) \ + a2 = a_cast; \ \ /* If the diagonal intersects the current MR x NR submatrix, we compute it the temporary buffer and then add in the elements @@ -266,7 +276,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale C and add the result to only the stored part. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn_u)( diagoffc_ij, \ @@ -283,7 +294,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ beta_cast, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ \ a1 += rstep_a; \ @@ -295,6 +307,9 @@ void PASTEMAC(ch,varname)( \ { \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + /* Compute the address of the next panel of A. */ \ + a2 = a_cast; \ \ /* The following conditional only executes when the bottom edge case for this particular column panel happens to intersect the @@ -307,7 +322,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale C and add the result to only the stored part. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn_u)( diagoffc_ij, \ @@ -331,18 +347,27 @@ void PASTEMAC(ch,varname)( \ of B to a local buffer with each value duplicated. */ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ + /* No panel of B follows the right edge, so wrap to the first panel. */ \ + b2 = b_cast; \ \ /* Right edge loop over the m dimension (MR rows at a time). */ \ /* (Note that the diagonal is guaranteed not to factor in here.) */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + /* Compute the address of the next panel of A. 
*/ \ + a2 = a1 + rstep_a; \ + if ( i == m_iter - 1 && m_left == 0 ) \ + a2 = a_cast; \ +\ /* Invoke the gemm micro-kernel. */ \ PASTEMAC(ch,ukrname)( k, \ alpha_cast, \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale the right edge of C and add the result. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn)( MR, n_left, \ @@ -360,13 +385,17 @@ void PASTEMAC(ch,varname)( \ /* Bottom-right corner handling. */ \ if ( m_left ) \ { \ + /* Compute the address of the next panel of A. */ \ + a2 = a_cast; \ +\ /* Invoke the gemm micro-kernel. */ \ PASTEMAC(ch,ukrname)( k, \ alpha_cast, \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Scale C and add the result to only the stored part. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn_u)( diagoffc_ij, \ diff --git a/frame/3/trmm/bli_trmm_l_ker_var2.c b/frame/3/trmm/bli_trmm_l_ker_var2.c index 49877f600..39283c507 100644 --- a/frame/3/trmm/bli_trmm_l_ker_var2.c +++ b/frame/3/trmm/bli_trmm_l_ker_var2.c @@ -181,6 +181,8 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c1; \ ctype* restrict c11; \ ctype* restrict bp_i; \ + ctype* restrict a2; \ + ctype* restrict b2; \ \ doff_t diagoffa_i; \ dim_t m_iter, m_left; \ @@ -281,6 +283,9 @@ void PASTEMAC(ch,varname)( \ columns of B to a local buffer with each value duplicated. */ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ + /* Compute the address of the next panel of B. */ \ + b2 = b1 + cstep_b; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -302,6 +307,11 @@ void PASTEMAC(ch,varname)( \ k_a1011 = bli_min( k, diagoffa_i + MR ); \ \ bp_i = bp + off_a1011 * NR * NDUP; \ +\ + /* Compute the address of the next panel of A. */ \ + a2 = a1 + k_a1011 * PACKMR; \ + if ( i == m_iter - 1 ) \ + a2 = a_cast; \ \ /* Handle interior and edge cases separately. 
*/ \ if ( m_cur == MR && n_cur == NR ) \ @@ -312,7 +322,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp_i, \ beta_cast, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ else \ { \ @@ -327,7 +338,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp_i, \ beta_cast, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Copy the result to the edge of C. */ \ PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \ @@ -339,6 +351,11 @@ void PASTEMAC(ch,varname)( \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ + /* Compute the address of the next panel of A. */ \ + a2 = a1 + rstep_a; \ + if ( i == m_iter - 1 ) \ + a2 = a_cast; \ +\ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ @@ -348,7 +365,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ one, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ else \ { \ @@ -358,7 +376,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \ diff --git a/frame/3/trmm/bli_trmm_u_ker_var2.c b/frame/3/trmm/bli_trmm_u_ker_var2.c index c42b0ac88..cdbf9f763 100644 --- a/frame/3/trmm/bli_trmm_u_ker_var2.c +++ b/frame/3/trmm/bli_trmm_u_ker_var2.c @@ -181,6 +181,8 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c1; \ ctype* restrict c11; \ ctype* restrict bp_i; \ + ctype* restrict a2; \ + ctype* restrict b2; \ \ doff_t diagoffa_i; \ dim_t m_iter, m_left; \ @@ -281,6 +283,9 @@ void PASTEMAC(ch,varname)( \ columns of B to a local buffer with each value duplicated. */ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ + /* Compute the address of the next panel of B. */ \ + b2 = b1 + cstep_b; \ \ /* Loop over the m dimension (MR rows at a time). 
*/ \ for ( i = 0; i < m_iter; ++i ) \ @@ -305,6 +310,11 @@ void PASTEMAC(ch,varname)( \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_u_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_u_ker_var2: b1", k_a1112, NR, bp_i, NR, 1, "%4.1f", "" );*/ \ +\ + /* Compute the address of the next panel of A. */ \ + a2 = a1 + k_a1112 * PACKMR; \ + if ( i == m_iter - 1 ) \ + a2 = a_cast; \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ @@ -315,7 +325,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp_i, \ beta_cast, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ else \ { \ @@ -330,7 +341,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp_i, \ beta_cast, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Copy the result to the edge of C. */ \ PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \ @@ -342,6 +354,11 @@ void PASTEMAC(ch,varname)( \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ + /* Compute the address of the next panel of A. */ \ + a2 = a1 + rstep_a; \ + if ( i == m_iter - 1 ) \ + a2 = a_cast; \ +\ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ @@ -351,7 +368,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ one, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ else \ { \ @@ -361,7 +379,8 @@ void PASTEMAC(ch,varname)( \ a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Add the result to the edge of C. 
*/ \ PASTEMAC2(ch,ch,adds_mxn)( m_cur, n_cur, \ diff --git a/frame/3/trsm/bli_trsm_l_ker_var2.c b/frame/3/trsm/bli_trsm_l_ker_var2.c index 6edee51fb..c516a0a87 100644 --- a/frame/3/trsm/bli_trsm_l_ker_var2.c +++ b/frame/3/trsm/bli_trsm_l_ker_var2.c @@ -175,6 +175,8 @@ void PASTEMAC(ch,varname)( \ ctype* restrict bp01; \ ctype* restrict bp11; \ ctype* restrict bp_i; \ + ctype* restrict a2; \ + ctype* restrict b2; \ \ doff_t diagoffa_i; \ dim_t m_iter, m_left; \ @@ -284,6 +286,9 @@ void PASTEMAC(ch,varname)( \ columns of B to a local buffer with each value duplicated. */ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ + /* Compute the address of the next panel of B. */ \ + b2 = b1 + cstep_b; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ @@ -324,6 +329,11 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \ */ \ +\ + /* Compute the address of the next panel of A. */ \ + a2 = a1 + k_a1011 * PACKMR; \ + if ( i == m_iter - 1 ) \ + a2 = a_cast; \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ @@ -336,7 +346,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR, bp01, \ bp11, \ b11, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ else \ { \ @@ -348,7 +359,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR, bp01, \ bp11, \ b11, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Copy the result to the bottom edge of C. 
*/ \ PASTEMAC2(ch,ch,copys_mxn)( m_cur, n_cur, \ @@ -364,6 +376,10 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR, PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ */ \ + /* Compute the address of the next panel of A. */ \ + a2 = a1 + rstep_a; \ + if ( i == m_iter - 1 ) \ + a2 = a_cast; \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ @@ -374,7 +390,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, " a1, \ bp, \ alpha_cast, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ else \ { \ @@ -384,7 +401,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, " a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_cur, n_cur, \ diff --git a/frame/3/trsm/bli_trsm_u_ker_var2.c b/frame/3/trsm/bli_trsm_u_ker_var2.c index ecb0977b1..68d853fec 100644 --- a/frame/3/trsm/bli_trsm_u_ker_var2.c +++ b/frame/3/trsm/bli_trsm_u_ker_var2.c @@ -175,6 +175,8 @@ void PASTEMAC(ch,varname)( \ ctype* restrict bp21; \ ctype* restrict bp11; \ ctype* restrict bp_i; \ + ctype* restrict a2; \ + ctype* restrict b2; \ \ doff_t diagoffa_i; \ dim_t m_iter, m_left; \ @@ -284,6 +286,9 @@ void PASTEMAC(ch,varname)( \ columns of B to a local buffer with each value duplicated. */ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ + /* Compute the address of the next panel of B. */ \ + b2 = b1 + cstep_b; \ \ /* Loop over the m dimension (MR rows at a time). 
*/ \ for ( ib = 0; ib < m_iter; ++ib ) \ @@ -338,7 +343,11 @@ printf( "k_a11 = %lu\n", k_a11 ); \ printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ */ \ - \ +\ + /* Next panel of A; wrap to the first panel on the last pass of the ib loop. */ \ + a2 = a1 + k_a1112 * PACKMR; \ + if ( ib == m_iter - 1 ) \ + a2 = a_cast; \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ @@ -351,7 +360,8 @@ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ bp21, \ bp11, \ b11, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ else \ { \ @@ -363,7 +373,8 @@ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ bp21, \ bp11, \ b11, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: bp11 after (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \ @@ -380,6 +391,11 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur, } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ + /* Next panel of A; wrap to the first panel on the last pass of the ib loop. */ \ + a2 = a1 + rstep_a; \ + if ( ib == m_iter - 1 ) \ + a2 = a_cast; \ +\ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ @@ -389,7 +405,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur, a1, \ bp, \ alpha_cast, \ - c11, rs_c, cs_c ); \ + c11, rs_c, cs_c, \ + a2, b2 ); \ } \ else \ { \ @@ -399,7 +416,8 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur, a1, \ bp, \ zero, \ - ct, rs_ct, cs_ct ); \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ \ /* Add the result to the edge of C. 
*/ \ PASTEMAC3(ch,ch,ch,xpbys_mxn)( m_cur, n_cur, \ diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c index 2affe9336..bb2844103 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c @@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict bdT, \ ctype* restrict bd, \ ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ) \ { \ const inc_t rs_b = PASTEMAC(ch,packnr); \ @@ -60,7 +62,9 @@ void PASTEMAC(ch,varname)( \ aL, \ bdT, \ alpha, \ - b, rs_b, cs_b ); \ + b, rs_b, cs_b, \ + a_next, \ + b_next ); \ \ /* b = inv(a) * b; bd = b; (if gemm ukernel needs duplicated B) diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h index 284acdd36..4253bf3fa 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h @@ -47,7 +47,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict bdT, \ ctype* restrict bd, \ ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemmtrsm_l_ref_mxn ) diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c index bbfbb829a..7972c306b 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c @@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict bdB, \ ctype* restrict bd, \ ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ) \ { \ const inc_t rs_b = PASTEMAC(ch,packnr); \ @@ -60,7 +62,9 @@ void PASTEMAC(ch,varname)( \ aR, \ bdB, \ alpha, \ 
- b, rs_b, cs_b ); \ + b, rs_b, cs_b, \ + a_next, \ + b_next ); \ \ /* b = inv(a) * b; bd = b; (if gemm ukernel needs duplicated B) diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h index 63a66f6af..2f61880b2 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h @@ -47,7 +47,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict bdB, \ ctype* restrict bd, \ ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemmtrsm_u_ref_mxn ) diff --git a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c b/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c index adcc124c7..8b0d45861 100644 --- a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c @@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict bd, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c \ ) \ { \ const dim_t m = PASTEMAC(ch,mr); \ diff --git a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.h b/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.h index 59c5a0dc9..b332bbee9 100644 --- a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.h +++ b/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.h @@ -43,7 +43,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict bd, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC( trsm_l_ref_mxn ) diff --git a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c index f67ded6f8..aa0c9ff3c 100644 --- a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c @@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict bd, \ - ctype* 
restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c \ ) \ { \ const dim_t m = PASTEMAC(ch,mr); \ diff --git a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.h b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.h index 80d7748a7..e1344c479 100644 --- a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.h +++ b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.h @@ -43,7 +43,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict bd, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC( trsm_u_ref_mxn ) diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 3e24a12c5..ab9012d52 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -424,6 +424,10 @@ \ ( i1 != iter - 1 || left == 0 ) +#define bli_is_last_iter_f( i1, iter, left ) \ + /* True when i1 is the final index (iter - 1) and left is zero. */ \ + ( (i1) == (iter) - 1 && (left) == 0 ) + #define bli_is_edge_b( i1, iter, left ) \ \ ( i1 == 0 && left != 0 ) @@ -432,6 +436,10 @@ \ ( i1 != 0 || left == 0 ) +#define bli_is_last_iter_b( i1, iter, left ) \ + /* True when i1 is zero and left is zero. */ \ + ( (i1) == 0 && (left) == 0 ) + // packbuf_t-related diff --git a/kernels/c99/bli_gemm_ref_4x2.c b/kernels/c99/bli_gemm_ref_4x2.c index 8dc804bbc..f88a1c412 100644 --- a/kernels/c99/bli_gemm_ref_4x2.c +++ b/kernels/c99/bli_gemm_ref_4x2.c @@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ) \ { \ ctype a0; \ diff --git a/kernels/c99/bli_gemm_ref_4x2.h b/kernels/c99/bli_gemm_ref_4x2.h index 7fe857987..92d1eb20c 100644 --- a/kernels/c99/bli_gemm_ref_4x2.h +++ b/kernels/c99/bli_gemm_ref_4x2.h @@ -42,7 +42,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype*
restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemm_ref_4x2 ) diff --git a/kernels/c99/bli_gemm_ref_4x4.c b/kernels/c99/bli_gemm_ref_4x4.c index a4b9bd1b3..38d0a708a 100644 --- a/kernels/c99/bli_gemm_ref_4x4.c +++ b/kernels/c99/bli_gemm_ref_4x4.c @@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ) \ { \ ctype a0; \ diff --git a/kernels/c99/bli_gemm_ref_4x4.h b/kernels/c99/bli_gemm_ref_4x4.h index 6cc689886..a233d0643 100644 --- a/kernels/c99/bli_gemm_ref_4x4.h +++ b/kernels/c99/bli_gemm_ref_4x4.h @@ -42,7 +42,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemm_ref_4x4 ) diff --git a/kernels/c99/bli_gemmtrsm_l_ref_4x4.c b/kernels/c99/bli_gemmtrsm_l_ref_4x4.c index 72a916b0d..1f1702937 100644 --- a/kernels/c99/bli_gemmtrsm_l_ref_4x4.c +++ b/kernels/c99/bli_gemmtrsm_l_ref_4x4.c @@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict bdT, \ ctype* restrict bd, \ ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ) \ { \ ctype* minus_one = PASTEMAC(ch,m1); \ @@ -59,7 +61,9 @@ void PASTEMAC(ch,varname)( \ aL, \ bdT, \ alpha, \ - b, rs_b, cs_b ); \ + b, rs_b, cs_b, \ + a_next, \ + b_next ); \ \ PASTEMAC(ch,trsmukr)( a, \ b, \ diff --git a/kernels/c99/bli_gemmtrsm_l_ref_4x4.h b/kernels/c99/bli_gemmtrsm_l_ref_4x4.h index 955a49ade..fa2291f53 100644 --- a/kernels/c99/bli_gemmtrsm_l_ref_4x4.h +++ b/kernels/c99/bli_gemmtrsm_l_ref_4x4.h @@ -44,7 +44,9 @@ 
void PASTEMAC(ch,varname)( \ ctype* restrict bdT, \ ctype* restrict bd, \ ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemmtrsm_l_ref_4x4 ) diff --git a/kernels/c99/bli_gemmtrsm_u_ref_4x4.c b/kernels/c99/bli_gemmtrsm_u_ref_4x4.c index 3bddd0f3c..5561793ba 100644 --- a/kernels/c99/bli_gemmtrsm_u_ref_4x4.c +++ b/kernels/c99/bli_gemmtrsm_u_ref_4x4.c @@ -46,7 +46,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict bdB, \ ctype* restrict bd, \ ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ) \ { \ ctype* minus_one = PASTEMAC(ch,m1); \ @@ -59,7 +61,8 @@ void PASTEMAC(ch,varname)( \ aR, \ bdB, \ alpha, \ - b, rs_b, cs_b ); \ + b, rs_b, cs_b, \ + a_next, b_next ); \ \ PASTEMAC(ch,trsmukr)( a, \ b, \ diff --git a/kernels/c99/bli_gemmtrsm_u_ref_4x4.h b/kernels/c99/bli_gemmtrsm_u_ref_4x4.h index 5d82718ce..ac732e72b 100644 --- a/kernels/c99/bli_gemmtrsm_u_ref_4x4.h +++ b/kernels/c99/bli_gemmtrsm_u_ref_4x4.h @@ -44,7 +44,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict bdB, \ ctype* restrict bd, \ ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemmtrsm_u_ref_4x4 ) diff --git a/kernels/x86/3/bli_gemm_opt_d2x4.c b/kernels/x86/3/bli_gemm_opt_d2x4.c index d112623e4..66f55133b 100644 --- a/kernels/x86/3/bli_gemm_opt_d2x4.c +++ b/kernels/x86/3/bli_gemm_opt_d2x4.c @@ -35,24 +35,24 @@ #include "blis.h" void bli_sgemm_opt_d2x4( - dim_t k, - float* alpha, - float* a, - float* b, - float* beta, - float* c, inc_t rs_c, inc_t cs_c + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c, inc_t cs_c ) { 
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dgemm_opt_d2x4( - dim_t k, - double* alpha, - double* a, - double* b, - double* beta, - double* c, inc_t rs_c, inc_t cs_c + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c, inc_t cs_c ) { dim_t k_iter; @@ -366,24 +366,24 @@ void bli_dgemm_opt_d2x4( } void bli_cgemm_opt_d2x4( - dim_t k, - scomplex* alpha, - scomplex* a, - scomplex* b, - scomplex* beta, - scomplex* c, inc_t rs_c, inc_t cs_c + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_zgemm_opt_d2x4( - dim_t k, - dcomplex* alpha, - dcomplex* a, - dcomplex* b, - dcomplex* beta, - dcomplex* c, inc_t rs_c, inc_t cs_c + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); diff --git a/kernels/x86/3/bli_gemm_opt_d2x4.h b/kernels/x86/3/bli_gemm_opt_d2x4.h index e177bcc00..2aed3b307 100644 --- a/kernels/x86/3/bli_gemm_opt_d2x4.h +++ b/kernels/x86/3/bli_gemm_opt_d2x4.h @@ -37,12 +37,12 @@ #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* alpha, \ - ctype* a, \ - ctype* b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC( gemm_opt_d2x4 ) diff --git a/kernels/x86/3/bli_gemm_opt_d4x2.c b/kernels/x86/3/bli_gemm_opt_d4x2.c index f17fe0132..00f7c208a 100644 --- a/kernels/x86/3/bli_gemm_opt_d4x2.c +++ b/kernels/x86/3/bli_gemm_opt_d4x2.c @@ -40,7 +40,9 @@ void bli_sgemm_opt_d4x2( float* restrict a, float* restrict b, 
float* restrict beta, - float* restrict c, inc_t rs_c, inc_t cs_c + float* restrict c, inc_t rs_c, inc_t cs_c, + float* restrict a_next, + float* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); @@ -52,7 +54,9 @@ void bli_dgemm_opt_d4x2( double* restrict a, double* restrict b, double* restrict beta, - double* restrict c, inc_t rs_c, inc_t cs_c + double* restrict c, inc_t rs_c, inc_t cs_c, + double* restrict a_next, + double* restrict b_next ) { dim_t k_iter; @@ -325,7 +329,9 @@ void bli_cgemm_opt_d4x2( scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, - scomplex* restrict c, inc_t rs_c, inc_t cs_c + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + scomplex* restrict a_next, + scomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); @@ -337,7 +343,9 @@ void bli_zgemm_opt_d4x2( dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, - dcomplex* restrict c, inc_t rs_c, inc_t cs_c + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + dcomplex* restrict a_next, + dcomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); diff --git a/kernels/x86/3/bli_gemm_opt_d4x2.h b/kernels/x86/3/bli_gemm_opt_d4x2.h index 733fa7723..10598b148 100644 --- a/kernels/x86/3/bli_gemm_opt_d4x2.h +++ b/kernels/x86/3/bli_gemm_opt_d4x2.h @@ -42,7 +42,9 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemm_opt_d4x2 ) diff --git a/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.c b/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.c index a325f347e..0d98fe435 100644 --- a/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.c +++ b/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.c @@ -35,26 +35,32 @@ #include "blis.h" void bli_sgemmtrsm_l_opt_d4x2( - dim_t k, - float* a10, - float* a11, - float* bd01, - float* bd11, - float* b11, - 
float* c11, inc_t rs_c, inc_t cs_c + dim_t k, + float* restrict alpha, + float* restrict a10, + float* restrict a11, + float* restrict bd01, + float* restrict bd11, + float* restrict b11, + float* restrict c11, inc_t rs_c, inc_t cs_c, + float* restrict a_next, + float* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dgemmtrsm_l_opt_d4x2( - dim_t k, - double* a10, - double* a11, - double* bd01, - double* bd11, - double* b11, - double* c11, inc_t rs_c, inc_t cs_c + dim_t k, + double* restrict alpha, + double* restrict a10, + double* restrict a11, + double* restrict bd01, + double* restrict bd11, + double* restrict b11, + double* restrict c11, inc_t rs_c, inc_t cs_c, + double* restrict a_next, + double* restrict b_next ) { dim_t k_iter; @@ -405,26 +411,32 @@ void bli_dgemmtrsm_l_opt_d4x2( } void bli_cgemmtrsm_l_opt_d4x2( - dim_t k, - scomplex* a10, - scomplex* a11, - scomplex* bd01, - scomplex* bd11, - scomplex* b11, - scomplex* c11, inc_t rs_c, inc_t cs_c + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a10, + scomplex* restrict a11, + scomplex* restrict bd01, + scomplex* restrict bd11, + scomplex* restrict b11, + scomplex* restrict c11, inc_t rs_c, inc_t cs_c, + scomplex* restrict a_next, + scomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_zgemmtrsm_l_opt_d4x2( - dim_t k, - dcomplex* a10, - dcomplex* a11, - dcomplex* bd01, - dcomplex* bd11, - dcomplex* b11, - dcomplex* c11, inc_t rs_c, inc_t cs_c + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a10, + dcomplex* restrict a11, + dcomplex* restrict bd01, + dcomplex* restrict bd11, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, + dcomplex* restrict a_next, + dcomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); diff --git a/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.h b/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.h index 9a61ff9fa..1e6167e69 100644 --- 
a/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.h +++ b/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.h @@ -37,13 +37,16 @@ #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* a10, \ - ctype* a11, \ - ctype* bd01, \ - ctype* bd11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a10, \ + ctype* restrict a11, \ + ctype* restrict bd01, \ + ctype* restrict bd11, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_d4x2 ) diff --git a/kernels/x86/3/bli_gemmtrsm_u_opt_d4x2.c b/kernels/x86/3/bli_gemmtrsm_u_opt_d4x2.c index 4b41ffbca..bb12b738d 100644 --- a/kernels/x86/3/bli_gemmtrsm_u_opt_d4x2.c +++ b/kernels/x86/3/bli_gemmtrsm_u_opt_d4x2.c @@ -35,26 +35,32 @@ #include "blis.h" void bli_sgemmtrsm_u_opt_d4x2( - dim_t k, - float* a12, - float* a11, - float* bd21, - float* bd11, - float* b11, - float* c11, inc_t rs_c, inc_t cs_c + dim_t k, + float* restrict alpha, + float* restrict a12, + float* restrict a11, + float* restrict bd21, + float* restrict bd11, + float* restrict b11, + float* restrict c11, inc_t rs_c, inc_t cs_c, + float* restrict a_next, + float* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dgemmtrsm_u_opt_d4x2( - dim_t k, - double* a12, - double* a11, - double* bd21, - double* bd11, - double* b11, - double* c11, inc_t rs_c, inc_t cs_c + dim_t k, + double* restrict alpha, + double* restrict a12, + double* restrict a11, + double* restrict bd21, + double* restrict bd11, + double* restrict b11, + double* restrict c11, inc_t rs_c, inc_t cs_c, + double* restrict a_next, + double* restrict b_next ) { dim_t k_iter; @@ -408,26 +414,32 @@ void bli_dgemmtrsm_u_opt_d4x2( } void bli_cgemmtrsm_u_opt_d4x2( - dim_t k, - scomplex* a12, - scomplex* a11, - scomplex* bd21, - scomplex* bd11, - scomplex* b11, - scomplex* c11, inc_t rs_c, 
inc_t cs_c + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a12, + scomplex* restrict a11, + scomplex* restrict bd21, + scomplex* restrict bd11, + scomplex* restrict b11, + scomplex* restrict c11, inc_t rs_c, inc_t cs_c, + scomplex* restrict a_next, + scomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_zgemmtrsm_u_opt_d4x2( - dim_t k, - dcomplex* a12, - dcomplex* a11, - dcomplex* bd21, - dcomplex* bd11, - dcomplex* b11, - dcomplex* c11, inc_t rs_c, inc_t cs_c + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a12, + dcomplex* restrict a11, + dcomplex* restrict bd21, + dcomplex* restrict bd11, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, + dcomplex* restrict a_next, + dcomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); diff --git a/kernels/x86/3/bli_gemmtrsm_u_opt_d4x2.h b/kernels/x86/3/bli_gemmtrsm_u_opt_d4x2.h index 3a43b0fe8..42c7622e0 100644 --- a/kernels/x86/3/bli_gemmtrsm_u_opt_d4x2.h +++ b/kernels/x86/3/bli_gemmtrsm_u_opt_d4x2.h @@ -37,13 +37,16 @@ #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* a12, \ - ctype* a11, \ - ctype* bd21, \ - ctype* bd11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a12, \ + ctype* restrict a11, \ + ctype* restrict bd21, \ + ctype* restrict bd11, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_d4x2 ) diff --git a/kernels/x86/3/bli_trsm_l_opt_d4x2.c b/kernels/x86/3/bli_trsm_l_opt_d4x2.c index 94e5e763a..b3e7935dd 100644 --- a/kernels/x86/3/bli_trsm_l_opt_d4x2.c +++ b/kernels/x86/3/bli_trsm_l_opt_d4x2.c @@ -35,20 +35,20 @@ #include "blis.h" void bli_strsm_l_opt_d4x2( - float* a11, - float* b11, - float* bd11, - float* c11, inc_t rs_c, inc_t cs_c + float* restrict a11, + float* 
restrict b11, + float* restrict bd11, + float* restrict c11, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dtrsm_l_opt_d4x2( - double* a11, - double* b11, - double* bd11, - double* c11, inc_t rs_c, inc_t cs_c + double* restrict a11, + double* restrict b11, + double* restrict bd11, + double* restrict c11, inc_t rs_c, inc_t cs_c ) { __asm__ volatile @@ -185,20 +185,20 @@ void bli_dtrsm_l_opt_d4x2( } void bli_ctrsm_l_opt_d4x2( - scomplex* a11, - scomplex* b11, - scomplex* bd11, - scomplex* c11, inc_t rs_c, inc_t cs_c + scomplex* restrict a11, + scomplex* restrict b11, + scomplex* restrict bd11, + scomplex* restrict c11, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_ztrsm_l_opt_d4x2( - dcomplex* a11, - dcomplex* b11, - dcomplex* bd11, - dcomplex* c11, inc_t rs_c, inc_t cs_c + dcomplex* restrict a11, + dcomplex* restrict b11, + dcomplex* restrict bd11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); diff --git a/kernels/x86/3/bli_trsm_l_opt_d4x2.h b/kernels/x86/3/bli_trsm_l_opt_d4x2.h index 59879f2c2..4a27fb796 100644 --- a/kernels/x86/3/bli_trsm_l_opt_d4x2.h +++ b/kernels/x86/3/bli_trsm_l_opt_d4x2.h @@ -37,10 +37,10 @@ #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ - ctype* a11, \ - ctype* b11, \ - ctype* bd11, \ - ctype* c11, inc_t rs_c, inc_t cs_c \ + ctype* restrict a11, \ + ctype* restrict b11, \ + ctype* restrict bd11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC( trsm_l_opt_d4x2 ) diff --git a/kernels/x86_64/3/bli_gemm_opt_d4x4.c b/kernels/x86_64/3/bli_gemm_opt_d4x4.c index 146a4bd01..5bd4bf5b8 100644 --- a/kernels/x86_64/3/bli_gemm_opt_d4x4.c +++ b/kernels/x86_64/3/bli_gemm_opt_d4x4.c @@ -35,24 +35,28 @@ #include "blis.h" void bli_sgemm_opt_d4x4( - dim_t k, - float* alpha, - float* a, - float* b, - float* beta, - float* c, inc_t rs_c, inc_t cs_c + dim_t k, + float* 
restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c, inc_t cs_c, + float* restrict a_next, + float* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dgemm_opt_d4x4( - dim_t k, - double* alpha, - double* a, - double* b, - double* beta, - double* c, inc_t rs_c, inc_t cs_c + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c, inc_t cs_c, + double* restrict a_next, + double* restrict b_next ) { dim_t k_iter; @@ -447,24 +451,28 @@ void bli_dgemm_opt_d4x4( } void bli_cgemm_opt_d4x4( - dim_t k, - scomplex* alpha, - scomplex* a, - scomplex* b, - scomplex* beta, - scomplex* c, inc_t rs_c, inc_t cs_c + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + scomplex* restrict a_next, + scomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_zgemm_opt_d4x4( - dim_t k, - dcomplex* alpha, - dcomplex* a, - dcomplex* b, - dcomplex* beta, - dcomplex* c, inc_t rs_c, inc_t cs_c + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + dcomplex* restrict a_next, + dcomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); diff --git a/kernels/x86_64/3/bli_gemm_opt_d4x4.h b/kernels/x86_64/3/bli_gemm_opt_d4x4.h index 331a59e72..43304920c 100644 --- a/kernels/x86_64/3/bli_gemm_opt_d4x4.h +++ b/kernels/x86_64/3/bli_gemm_opt_d4x4.h @@ -37,12 +37,14 @@ #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* alpha, \ - ctype* a, \ - ctype* b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + 
ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemm_opt_d4x4 ) diff --git a/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.c b/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.c index 8538fbaa3..fc1f3e389 100644 --- a/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.c +++ b/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.c @@ -35,26 +35,30 @@ #include "blis.h" void bli_sgemmtrsm_l_opt_d4x4( - dim_t k, - float* a10, - float* a11, - float* bd01, - float* bd11, - float* b11, - float* c11, inc_t rs_c, inc_t cs_c + dim_t k, + float* restrict a10, + float* restrict a11, + float* restrict bd01, + float* restrict bd11, + float* restrict b11, + float* restrict c11, inc_t rs_c, inc_t cs_c, + float* restrict a_next, + float* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dgemmtrsm_l_opt_d4x4( - dim_t k, - double* a10, - double* a11, - double* bd01, - double* bd11, - double* b11, - double* c11, inc_t rs_c, inc_t cs_c + dim_t k, + double* restrict a10, + double* restrict a11, + double* restrict bd01, + double* restrict bd11, + double* restrict b11, + double* restrict c11, inc_t rs_c, inc_t cs_c, + double* restrict a_next, + double* restrict b_next ) { dim_t k_iter; @@ -500,26 +504,30 @@ void bli_dgemmtrsm_l_opt_d4x4( } void bli_cgemmtrsm_l_opt_d4x4( - dim_t k, - scomplex* a10, - scomplex* a11, - scomplex* bd01, - scomplex* bd11, - scomplex* b11, - scomplex* c11, inc_t rs_c, inc_t cs_c + dim_t k, + scomplex* restrict a10, + scomplex* restrict a11, + scomplex* restrict bd01, + scomplex* restrict bd11, + scomplex* restrict b11, + scomplex* restrict c11, inc_t rs_c, inc_t cs_c, + scomplex* restrict a_next, + scomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_zgemmtrsm_l_opt_d4x4( - dim_t k, - dcomplex* a10, - dcomplex* a11, - dcomplex* bd01, - dcomplex* bd11, - dcomplex* b11, - dcomplex* c11, inc_t rs_c, inc_t cs_c + dim_t k, + dcomplex* restrict a10, + 
dcomplex* restrict a11, + dcomplex* restrict bd01, + dcomplex* restrict bd11, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, + dcomplex* restrict a_next, + dcomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); diff --git a/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.h b/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.h index 7891b0d4e..11962c88b 100644 --- a/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.h +++ b/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.h @@ -37,13 +37,15 @@ #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* a10, \ - ctype* a11, \ - ctype* bd01, \ - ctype* bd11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c \ + dim_t k, \ + ctype* restrict a10, \ + ctype* restrict a11, \ + ctype* restrict bd01, \ + ctype* restrict bd11, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_d4x4 ) diff --git a/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.c b/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.c index 7b15e89ec..469964f45 100644 --- a/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.c +++ b/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.c @@ -35,26 +35,30 @@ #include "blis.h" void bli_sgemmtrsm_u_opt_d4x4( - dim_t k, - float* a12, - float* a11, - float* bd21, - float* bd11, - float* b11, - float* c11, inc_t rs_c, inc_t cs_c + dim_t k, + float* restrict a12, + float* restrict a11, + float* restrict bd21, + float* restrict bd11, + float* restrict b11, + float* restrict c11, inc_t rs_c, inc_t cs_c, + float* restrict a_next, + float* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dgemmtrsm_u_opt_d4x4( - dim_t k, - double* a12, - double* a11, - double* bd21, - double* bd11, - double* b11, - double* c11, inc_t rs_c, inc_t cs_c + dim_t k, + double* restrict a12, + double* restrict a11, + double* restrict bd21, + double* restrict bd11, + double* 
restrict b11, + double* restrict c11, inc_t rs_c, inc_t cs_c, + double* restrict a_next, + double* restrict b_next ) { dim_t k_iter; @@ -503,26 +507,30 @@ void bli_dgemmtrsm_u_opt_d4x4( } void bli_cgemmtrsm_u_opt_d4x4( - dim_t k, - scomplex* a12, - scomplex* a11, - scomplex* bd21, - scomplex* bd11, - scomplex* b11, - scomplex* c11, inc_t rs_c, inc_t cs_c + dim_t k, + scomplex* restrict a12, + scomplex* restrict a11, + scomplex* restrict bd21, + scomplex* restrict bd11, + scomplex* restrict b11, + scomplex* restrict c11, inc_t rs_c, inc_t cs_c, + scomplex* restrict a_next, + scomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_zgemmtrsm_u_opt_d4x4( - dim_t k, - dcomplex* a12, - dcomplex* a11, - dcomplex* bd21, - dcomplex* bd11, - dcomplex* b11, - dcomplex* c11, inc_t rs_c, inc_t cs_c + dim_t k, + dcomplex* restrict a12, + dcomplex* restrict a11, + dcomplex* restrict bd21, + dcomplex* restrict bd11, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, + dcomplex* restrict a_next, + dcomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); diff --git a/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.h b/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.h index 1071d0eac..c88a696a2 100644 --- a/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.h +++ b/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.h @@ -37,13 +37,15 @@ #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* a12, \ - ctype* a11, \ - ctype* bd21, \ - ctype* bd11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c \ + dim_t k, \ + ctype* restrict a12, \ + ctype* restrict a11, \ + ctype* restrict bd21, \ + ctype* restrict bd11, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ ); INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_d4x4 )