From 2d9c667f3c48a12cab64e5ad09d5fcb9f4c19d78 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee"
Date: Fri, 24 May 2013 16:28:10 -0500
Subject: [PATCH] Fixed x86_64 kernel bugs and other minor issues.

Details:
- Fixed bugs in trmv_l and trsv_u due to backwards iteration resulting in unaligned subpartitions. We were already going out of our way a bit to handle edge cases in the first iteration for blocked variants, and this was simply the unblocked-fused extension of that idea.
- Fixed control tree handling in her/her2/syr/syr2 that was not taking into account how the choice of variant needed to be altered for upper-stored matrices (given that only lower-stored algorithms are explicitly implemented).
- Added bli_determine_blocksize_dim_f(), bli_determine_blocksize_dim_b() macros to provide inlined versions of bli_determine_blocksize_[fb]() for use by unblocked-fused variants.
- Integrated new blocksize_dim macros into gemv/hemv unf variants for consistency with the bugfix for trmv/trsv (both of which now use the same macros).
- Modified bli_obj_vector_inc() so that 1 is returned if the object is a vector of length 1 (i.e., 1 x 1). This fixes a bug whereby under certain conditions (e.g. dotv_opt_var1), an invalid increment was returned, which was invalid only because the code was expecting 1 (for purposes of performing contiguous vector loads) but got a value greater than 1 because the column stride of the object (e.g. rho) was inflated for alignment purposes (albeit unnecessarily since there is only one element in the object).
- Replaced some old invocations of set0 with set0s.
- Added alpha parameter to gemmtrsm ukernels for x86_64 and use accordingly.
- Fixed increment bug in cleanup loop of gemm ukernel for x86_64.
- Added safeguard to test modules so that testing a problem with a zero dimension does not result in a failure.
- Tweaked handling of zero dimensions in level-2 and level-3 operations' internal back-ends to correctly handle cases where output operand still needs to be scaled (e.g. by beta, in the case of gemm with k = 0). --- config/clarksville/bli_kernel.h | 4 +-- frame/2/gemv/bli_gemv_int.c | 12 +++++-- frame/2/gemv/bli_gemv_unf_var1.c | 2 +- frame/2/gemv/bli_gemv_unf_var2.c | 2 +- frame/2/ger/bli_ger_int.c | 11 +++++-- frame/2/hemv/bli_hemv_int.c | 12 +++++-- frame/2/hemv/bli_hemv_unf_var1.c | 2 +- frame/2/hemv/bli_hemv_unf_var3.c | 2 +- frame/2/her/bli_her.c | 38 ++++++++++++++++------ frame/2/her/bli_her_cntl.c | 32 +++++++++--------- frame/2/her/bli_her_int.c | 4 +-- frame/2/her2/bli_her2.c | 38 ++++++++++++++++------ frame/2/her2/bli_her2_cntl.c | 32 +++++++++--------- frame/2/her2/bli_her2_int.c | 4 +-- frame/2/syr/bli_syr.c | 38 ++++++++++++++++------ frame/2/syr2/bli_syr2.c | 38 ++++++++++++++++------ frame/2/trmv/bli_trmv_int.c | 3 +- frame/2/trmv/bli_trmv_unf_var1.c | 14 ++++---- frame/2/trmv/bli_trmv_unf_var2.c | 14 ++++---- frame/2/trsv/bli_trsv_int.c | 3 +- frame/2/trsv/bli_trsv_unf_var1.c | 14 ++++---- frame/2/trsv/bli_trsv_unf_var2.c | 14 ++++---- frame/3/gemm/bli_gemm_int.c | 12 +++++-- frame/3/her2k/bli_her2k_int.c | 18 ++++++---- frame/3/herk/bli_herk_int.c | 14 +++++--- frame/3/trmm/bli_trmm_int.c | 12 +++++-- frame/3/trsm/bli_trsm_int.c | 12 +++++-- frame/include/bli_obj_macro_defs.h | 9 ++++- frame/include/bli_param_macro_defs.h | 13 ++++++++ kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.c | 13 ++++---- kernels/x86_64/1/bli_dotv_opt_var1.c | 8 ++--- kernels/x86_64/1f/bli_dotaxpyv_opt_var1.c | 4 +-- kernels/x86_64/1f/bli_dotxaxpyf_opt_var1.c | 10 +++--- kernels/x86_64/1f/bli_dotxf_opt_var1.c | 8 ++--- kernels/x86_64/3/bli_gemm_opt_d4x4.c | 4 +-- kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.c | 23 +++++++++++-- kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.h | 1 + kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.c | 25 +++++++++++--- kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.h | 
1 + testsuite/input.operations | 2 +- testsuite/src/test_addm.c | 3 ++ testsuite/src/test_addv.c | 3 ++ testsuite/src/test_axpym.c | 3 ++ testsuite/src/test_axpyv.c | 3 ++ testsuite/src/test_copym.c | 3 ++ testsuite/src/test_copyv.c | 3 ++ testsuite/src/test_dotv.c | 3 ++ testsuite/src/test_dotxv.c | 3 ++ testsuite/src/test_fnormm.c | 3 ++ testsuite/src/test_fnormv.c | 3 ++ testsuite/src/test_gemm.c | 3 ++ testsuite/src/test_gemv.c | 3 ++ testsuite/src/test_ger.c | 3 ++ testsuite/src/test_hemm.c | 3 ++ testsuite/src/test_hemv.c | 3 ++ testsuite/src/test_her.c | 3 ++ testsuite/src/test_her2.c | 3 ++ testsuite/src/test_her2k.c | 3 ++ testsuite/src/test_herk.c | 3 ++ testsuite/src/test_libblis.c | 9 +++++ testsuite/src/test_libblis.h | 4 +++ testsuite/src/test_randm.c | 3 ++ testsuite/src/test_randv.c | 3 ++ testsuite/src/test_scal2m.c | 3 ++ testsuite/src/test_scal2v.c | 3 ++ testsuite/src/test_scalm.c | 3 ++ testsuite/src/test_scalv.c | 3 ++ testsuite/src/test_setm.c | 3 ++ testsuite/src/test_setv.c | 3 ++ testsuite/src/test_subm.c | 3 ++ testsuite/src/test_subv.c | 3 ++ testsuite/src/test_symm.c | 3 ++ testsuite/src/test_symv.c | 3 ++ testsuite/src/test_syr.c | 3 ++ testsuite/src/test_syr2.c | 3 ++ testsuite/src/test_syr2k.c | 3 ++ testsuite/src/test_syrk.c | 3 ++ testsuite/src/test_trmm.c | 3 ++ testsuite/src/test_trmm3.c | 3 ++ testsuite/src/test_trmv.c | 3 ++ testsuite/src/test_trsm.c | 3 ++ testsuite/src/test_trsv.c | 3 ++ 82 files changed, 480 insertions(+), 175 deletions(-) diff --git a/config/clarksville/bli_kernel.h b/config/clarksville/bli_kernel.h index e9b2f9981..ebcbe338d 100644 --- a/config/clarksville/bli_kernel.h +++ b/config/clarksville/bli_kernel.h @@ -265,8 +265,8 @@ #define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x4 #define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x4 -#define TRSM_L_UKERNEL trsm_l_ref_4x4 -#define TRSM_U_UKERNEL trsm_u_ref_4x4 +#define TRSM_L_UKERNEL trsm_l_ref_mxn +#define TRSM_U_UKERNEL trsm_u_ref_mxn diff --git 
a/frame/2/gemv/bli_gemv_int.c b/frame/2/gemv/bli_gemv_int.c index 8442bed83..10812ba5b 100644 --- a/frame/2/gemv/bli_gemv_int.c +++ b/frame/2/gemv/bli_gemv_int.c @@ -75,11 +75,17 @@ void bli_gemv_int( trans_t transa, if ( bli_error_checking_is_enabled() ) bli_gemv_int_check( alpha, &a_local, &x_local, beta, y, cntl ); - // Return early if one of the operands has a zero dimension. - if ( bli_obj_has_zero_dim( *a ) ) return; - if ( bli_obj_has_zero_dim( *x ) ) return; + // If y has a zero dimension, return early. if ( bli_obj_has_zero_dim( *y ) ) return; + // If A or x has a zero dimension, scale y by beta and return early. + if ( bli_obj_has_zero_dim( *a ) || + bli_obj_has_zero_dim( *x ) ) + { + bli_scalm( beta, y ); + return; + } + // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c index d0d0497d5..c58f7edc2 100644 --- a/frame/2/gemv/bli_gemv_unf_var1.c +++ b/frame/2/gemv/bli_gemv_unf_var1.c @@ -164,7 +164,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \ \ for ( i = 0; i < n_iter; i += f ) \ { \ - f = bli_min( n_iter - i, b_fuse ); \ + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ \ A1 = a_cast + (i )*rs_at + (0 )*cs_at; \ x1 = x_cast + (0 )*incy; \ diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index 9cfff5ac8..515c33a17 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -182,7 +182,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \ \ for ( i = 0; i < n_iter; i += f ) \ { \ - f = bli_min( n_iter - i, b_fuse ); \ + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ \ A1 = a_cast + (0 )*rs_at + (i )*cs_at; \ x1 = x_cast + (i )*incx; \ diff --git a/frame/2/ger/bli_ger_int.c b/frame/2/ger/bli_ger_int.c index b31e72945..59a0b3fbb 100644 --- a/frame/2/ger/bli_ger_int.c +++ b/frame/2/ger/bli_ger_int.c @@ -71,11 +71,16 @@ void bli_ger_int( conj_t conjx, if ( 
bli_error_checking_is_enabled() ) bli_ger_int_check( alpha, x, y, a, cntl ); - // Return early if one of the operands has a zero dimension. - if ( bli_obj_has_zero_dim( *x ) ) return; - if ( bli_obj_has_zero_dim( *y ) ) return; + // If A has a zero dimension, return early. if ( bli_obj_has_zero_dim( *a ) ) return; + // If x or y has a zero dimension, return early. + if ( bli_obj_has_zero_dim( *x ) || + bli_obj_has_zero_dim( *y ) ) + { + return; + } + // Alias the objects, applying conjx and conjy to x and y, respectively. bli_obj_alias_with_conj( conjx, *x, x_local ); bli_obj_alias_with_conj( conjy, *y, y_local ); diff --git a/frame/2/hemv/bli_hemv_int.c b/frame/2/hemv/bli_hemv_int.c index f17fd4524..5fc157fcc 100644 --- a/frame/2/hemv/bli_hemv_int.c +++ b/frame/2/hemv/bli_hemv_int.c @@ -70,11 +70,17 @@ void bli_hemv_int( conj_t conjh, if ( bli_error_checking_is_enabled() ) bli_hemv_int_check( conjh, alpha, a, x, beta, y, cntl ); - // Return early if one of the operands has a zero dimension. - if ( bli_obj_has_zero_dim( *a ) ) return; - if ( bli_obj_has_zero_dim( *x ) ) return; + // If y has a zero dimension, return early. if ( bli_obj_has_zero_dim( *y ) ) return; + // If A or x has a zero dimension, scale y by beta and return early. + if ( bli_obj_has_zero_dim( *a ) || + bli_obj_has_zero_dim( *x ) ) + { + bli_scalm( beta, y ); + return; + } + // Alias A in case we need to induce the upper triangular case. 
bli_obj_alias_to( *a, a_local ); diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c index e6228036d..228a50b0f 100644 --- a/frame/2/hemv/bli_hemv_unf_var1.c +++ b/frame/2/hemv/bli_hemv_unf_var1.c @@ -215,7 +215,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \ \ for ( i = 0; i < m; i += f ) \ { \ - f = bli_min( m - i, b_fuse ); \ + f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \ n_behind = i; \ A10 = a_cast + (i )*rs_at + (0 )*cs_at; \ A11 = a_cast + (i )*rs_at + (i )*cs_at; \ diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c index 8f71b7640..17fa46d16 100644 --- a/frame/2/hemv/bli_hemv_unf_var3.c +++ b/frame/2/hemv/bli_hemv_unf_var3.c @@ -233,7 +233,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \ \ for ( i = 0; i < m; i += f ) \ { \ - f = bli_min( m - i, b_fuse ); \ + f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \ n_ahead = m - i - f; \ A11 = a_cast + (i )*rs_at + (i )*cs_at; \ A21 = a_cast + (i+f)*rs_at + (i )*cs_at; \ diff --git a/frame/2/her/bli_her.c b/frame/2/her/bli_her.c index 249af431f..5cc4882ba 100644 --- a/frame/2/her/bli_her.c +++ b/frame/2/her/bli_her.c @@ -34,10 +34,10 @@ #include "blis.h" -extern her_t* her_cntl_bs_ke_row; -extern her_t* her_cntl_bs_ke_col; -extern her_t* her_cntl_ge_row; -extern her_t* her_cntl_ge_col; +extern her_t* her_cntl_bs_ke_lrow_ucol; +extern her_t* her_cntl_bs_ke_lcol_urow; +extern her_t* her_cntl_ge_lrow_ucol; +extern her_t* her_cntl_ge_lcol_urow; void bli_her( obj_t* alpha, obj_t* x, @@ -79,10 +79,20 @@ void bli_her( obj_t* alpha, if ( x_is_contig && c_is_contig ) { - // Use different control trees depending on storage of the matrix - // operand. - if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_row; - else her_cntl = her_cntl_bs_ke_col; + // We use two control trees to handle the four cases corresponding to + // combinations of upper/lower triangular storage and row/column-storage. 
+ // The row-stored lower triangular and column-stored upper triangular + // trees are identical. Same for the remaining two trees. + if ( bli_obj_is_lower( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol; + else her_cntl = her_cntl_bs_ke_lcol_urow; + } + else // if ( bli_obj_is_upper( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lcol_urow; + else her_cntl = her_cntl_bs_ke_lrow_ucol; + } } else { @@ -93,8 +103,16 @@ void bli_her( obj_t* alpha, // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. - if ( bli_obj_is_row_tilted( *c ) ) her_cntl = her_cntl_ge_row; - else her_cntl = her_cntl_ge_col; + if ( bli_obj_is_lower( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lrow_ucol; + else her_cntl = her_cntl_ge_lcol_urow; + } + else // if ( bli_obj_is_upper( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lcol_urow; + else her_cntl = her_cntl_ge_lrow_ucol; + } } diff --git a/frame/2/her/bli_her_cntl.c b/frame/2/her/bli_her_cntl.c index 3197dac49..c3e94d993 100644 --- a/frame/2/her/bli_her_cntl.c +++ b/frame/2/her/bli_her_cntl.c @@ -45,11 +45,11 @@ extern ger_t* ger_cntl_bs_ke_col; static blksz_t* her_mc; -her_t* her_cntl_bs_ke_row; -her_t* her_cntl_bs_ke_col; +her_t* her_cntl_bs_ke_lrow_ucol; +her_t* her_cntl_bs_ke_lcol_urow; -her_t* her_cntl_ge_row; -her_t* her_cntl_ge_col; +her_t* her_cntl_ge_lrow_ucol; +her_t* her_cntl_ge_lcol_urow; // Cache blocksizes. @@ -71,13 +71,13 @@ void bli_her_cntl_init() // Create control trees for the lowest-level kernels. These trees induce // operations on (persumably) relatively small block-subvector problems. 
- her_cntl_bs_ke_row + her_cntl_bs_ke_lrow_ucol = bli_her_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, NULL, NULL, NULL, NULL, NULL, NULL ); - her_cntl_bs_ke_col + her_cntl_bs_ke_lcol_urow = bli_her_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT2, @@ -88,34 +88,34 @@ void bli_her_cntl_init() // Create control trees for generally large problems. Here, we choose // variants that partition for ger subproblems in the same direction // as the assumed storage. - her_cntl_ge_row + her_cntl_ge_lrow_ucol = bli_her_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, // use var1 for row storage + BLIS_VARIANT1, her_mc, packv_cntl, // pack x1 (if needed) NULL, // do NOT pack C11 ger_cntl_rp_bs_row, - her_cntl_bs_ke_row, + her_cntl_bs_ke_lrow_ucol, NULL ); // no unpacking needed - her_cntl_ge_col + her_cntl_ge_lcol_urow = bli_her_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, // use var2 for col storage + BLIS_VARIANT2, her_mc, packv_cntl, // pack x1 (if needed) NULL, // do NOT pack C11 ger_cntl_cp_bs_col, - her_cntl_bs_ke_col, + her_cntl_bs_ke_lcol_urow, NULL ); // no unpacking needed } void bli_her_cntl_finalize() { - bli_cntl_obj_free( her_cntl_bs_ke_row ); - bli_cntl_obj_free( her_cntl_bs_ke_col ); - bli_cntl_obj_free( her_cntl_ge_row ); - bli_cntl_obj_free( her_cntl_ge_col ); + bli_cntl_obj_free( her_cntl_bs_ke_lrow_ucol ); + bli_cntl_obj_free( her_cntl_bs_ke_lcol_urow ); + bli_cntl_obj_free( her_cntl_ge_lrow_ucol ); + bli_cntl_obj_free( her_cntl_ge_lcol_urow ); } diff --git a/frame/2/her/bli_her_int.c b/frame/2/her/bli_her_int.c index 28e181ae4..2d9a8ec39 100644 --- a/frame/2/her/bli_her_int.c +++ b/frame/2/her/bli_her_int.c @@ -67,9 +67,9 @@ void bli_her_int( conj_t conjh, if ( bli_error_checking_is_enabled() ) bli_her_int_check( conjh, alpha, x, c, cntl ); - // Return early if one of the operands has a zero dimension. - if ( bli_obj_has_zero_dim( *x ) ) return; + // If C or x has a zero dimension, return early. 
if ( bli_obj_has_zero_dim( *c ) ) return; + if ( bli_obj_has_zero_dim( *x ) ) return; // Alias the operands in case we need to apply conjugations. bli_obj_alias_to( *x, x_local ); diff --git a/frame/2/her2/bli_her2.c b/frame/2/her2/bli_her2.c index 1a282898c..1e99d9f54 100644 --- a/frame/2/her2/bli_her2.c +++ b/frame/2/her2/bli_her2.c @@ -34,10 +34,10 @@ #include "blis.h" -extern her2_t* her2_cntl_bs_ke_row; -extern her2_t* her2_cntl_bs_ke_col; -extern her2_t* her2_cntl_ge_row; -extern her2_t* her2_cntl_ge_col; +extern her2_t* her2_cntl_bs_ke_lrow_ucol; +extern her2_t* her2_cntl_bs_ke_lcol_urow; +extern her2_t* her2_cntl_ge_lrow_ucol; +extern her2_t* her2_cntl_ge_lcol_urow; void bli_her2( obj_t* alpha, obj_t* x, @@ -93,10 +93,20 @@ void bli_her2( obj_t* alpha, y_is_contig && c_is_contig ) { - // Use different control trees depending on storage of the matrix - // operand. - if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_row; - else her2_cntl = her2_cntl_bs_ke_col; + // We use two control trees to handle the four cases corresponding to + // combinations of upper/lower triangular storage and row/column-storage. + // The row-stored lower triangular and column-stored upper triangular + // trees are identical. Same for the remaining two trees. + if ( bli_obj_is_lower( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol; + else her2_cntl = her2_cntl_bs_ke_lcol_urow; + } + else // if ( bli_obj_is_upper( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow; + else her2_cntl = her2_cntl_bs_ke_lrow_ucol; + } } else { @@ -108,8 +118,16 @@ void bli_her2( obj_t* alpha, // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. 
- if ( bli_obj_is_row_tilted( *c ) ) her2_cntl = her2_cntl_ge_row; - else her2_cntl = her2_cntl_ge_col; + if ( bli_obj_is_lower( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lrow_ucol; + else her2_cntl = her2_cntl_ge_lcol_urow; + } + else // if ( bli_obj_is_upper( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lcol_urow; + else her2_cntl = her2_cntl_ge_lrow_ucol; + } } diff --git a/frame/2/her2/bli_her2_cntl.c b/frame/2/her2/bli_her2_cntl.c index 2b3983f2d..117c77d18 100644 --- a/frame/2/her2/bli_her2_cntl.c +++ b/frame/2/her2/bli_her2_cntl.c @@ -43,11 +43,11 @@ extern ger_t* ger_cntl_cp_bs_col; static blksz_t* her2_mc; -her2_t* her2_cntl_bs_ke_row; -her2_t* her2_cntl_bs_ke_col; +her2_t* her2_cntl_bs_ke_lrow_ucol; +her2_t* her2_cntl_bs_ke_lcol_urow; -her2_t* her2_cntl_ge_row; -her2_t* her2_cntl_ge_col; +her2_t* her2_cntl_ge_lrow_ucol; +her2_t* her2_cntl_ge_lcol_urow; // Cache blocksizes. @@ -69,14 +69,14 @@ void bli_her2_cntl_init() // Create control trees for the lowest-level kernels. These trees induce // operations on (persumably) relatively small block-subvector problems. - her2_cntl_bs_ke_row + her2_cntl_bs_ke_lrow_ucol = bli_her2_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); - her2_cntl_bs_ke_col + her2_cntl_bs_ke_lcol_urow = bli_her2_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT4, @@ -88,38 +88,38 @@ void bli_her2_cntl_init() // Create control trees for generally large problems. Here, we choose // variants that partition for ger subproblems in the same direction // as the assumed storage. 
- her2_cntl_ge_row + her2_cntl_ge_lrow_ucol = bli_her2_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, // use var1 for row storage + BLIS_VARIANT1, her2_mc, packv_cntl, // pack x1 (if needed) packv_cntl, // pack y1 (if needed) packm_cntl_noscale, // pack C11 (if needed) ger_cntl_rp_bs_row, ger_cntl_rp_bs_row, - her2_cntl_bs_ke_row, + her2_cntl_bs_ke_lrow_ucol, unpackm_cntl ); // unpack C11 (if packed) - her2_cntl_ge_col + her2_cntl_ge_lcol_urow = bli_her2_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT4, // use var4 for col storage + BLIS_VARIANT4, her2_mc, packv_cntl, // pack x1 (if needed) packv_cntl, // pack y1 (if needed) packm_cntl_noscale, // pack C11 (if needed) ger_cntl_cp_bs_col, ger_cntl_cp_bs_col, - her2_cntl_bs_ke_col, + her2_cntl_bs_ke_lcol_urow, unpackm_cntl ); // unpack C11 (if packed) } void bli_her2_cntl_finalize() { - bli_cntl_obj_free( her2_cntl_bs_ke_row ); - bli_cntl_obj_free( her2_cntl_bs_ke_col ); - bli_cntl_obj_free( her2_cntl_ge_row ); - bli_cntl_obj_free( her2_cntl_ge_col ); + bli_cntl_obj_free( her2_cntl_bs_ke_lrow_ucol ); + bli_cntl_obj_free( her2_cntl_bs_ke_lcol_urow ); + bli_cntl_obj_free( her2_cntl_ge_lrow_ucol ); + bli_cntl_obj_free( her2_cntl_ge_lcol_urow ); } diff --git a/frame/2/her2/bli_her2_int.c b/frame/2/her2/bli_her2_int.c index cd498f001..e40dc3c13 100644 --- a/frame/2/her2/bli_her2_int.c +++ b/frame/2/her2/bli_her2_int.c @@ -74,10 +74,10 @@ void bli_her2_int( conj_t conjh, if ( bli_error_checking_is_enabled() ) bli_her2_int_check( conjh, alpha, x, y, c, cntl ); - // Return early if one of the operands has a zero dimension. + // If C, x, or y has a zero dimension, return early. + if ( bli_obj_has_zero_dim( *c ) ) return; if ( bli_obj_has_zero_dim( *x ) ) return; if ( bli_obj_has_zero_dim( *y ) ) return; - if ( bli_obj_has_zero_dim( *c ) ) return; // Alias the operands in case we need to apply conjugations. 
bli_obj_alias_to( *x, x_local ); diff --git a/frame/2/syr/bli_syr.c b/frame/2/syr/bli_syr.c index 3f14ed632..e3cd31070 100644 --- a/frame/2/syr/bli_syr.c +++ b/frame/2/syr/bli_syr.c @@ -34,10 +34,10 @@ #include "blis.h" -extern her_t* her_cntl_bs_ke_row; -extern her_t* her_cntl_bs_ke_col; -extern her_t* her_cntl_ge_row; -extern her_t* her_cntl_ge_col; +extern her_t* her_cntl_bs_ke_lrow_ucol; +extern her_t* her_cntl_bs_ke_lcol_urow; +extern her_t* her_cntl_ge_lrow_ucol; +extern her_t* her_cntl_ge_lcol_urow; void bli_syr( obj_t* alpha, obj_t* x, @@ -81,10 +81,20 @@ void bli_syr( obj_t* alpha, if ( x_is_contig && c_is_contig ) { - // Use different control trees depending on storage of the matrix - // operand. - if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_row; - else her_cntl = her_cntl_bs_ke_col; + // We use two control trees to handle the four cases corresponding to + // combinations of upper/lower triangular storage and row/column-storage. + // The row-stored lower triangular and column-stored upper triangular + // trees are identical. Same for the remaining two trees. + if ( bli_obj_is_lower( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol; + else her_cntl = her_cntl_bs_ke_lcol_urow; + } + else // if ( bli_obj_is_upper( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lcol_urow; + else her_cntl = her_cntl_bs_ke_lrow_ucol; + } } else { @@ -95,8 +105,16 @@ void bli_syr( obj_t* alpha, // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. 
- if ( bli_obj_is_row_tilted( *c ) ) her_cntl = her_cntl_ge_row; - else her_cntl = her_cntl_ge_col; + if ( bli_obj_is_lower( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lrow_ucol; + else her_cntl = her_cntl_ge_lcol_urow; + } + else // if ( bli_obj_is_upper( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lcol_urow; + else her_cntl = her_cntl_ge_lrow_ucol; + } } diff --git a/frame/2/syr2/bli_syr2.c b/frame/2/syr2/bli_syr2.c index 3d0931d43..7b1efd612 100644 --- a/frame/2/syr2/bli_syr2.c +++ b/frame/2/syr2/bli_syr2.c @@ -34,10 +34,10 @@ #include "blis.h" -extern her2_t* her2_cntl_bs_ke_row; -extern her2_t* her2_cntl_bs_ke_col; -extern her2_t* her2_cntl_ge_row; -extern her2_t* her2_cntl_ge_col; +extern her2_t* her2_cntl_bs_ke_lrow_ucol; +extern her2_t* her2_cntl_bs_ke_lcol_urow; +extern her2_t* her2_cntl_ge_lrow_ucol; +extern her2_t* her2_cntl_ge_lcol_urow; void bli_syr2( obj_t* alpha, obj_t* x, @@ -86,10 +86,20 @@ void bli_syr2( obj_t* alpha, y_is_contig && c_is_contig ) { - // Use different control trees depending on storage of the matrix - // operand. - if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_row; - else her2_cntl = her2_cntl_bs_ke_col; + // We use two control trees to handle the four cases corresponding to + // combinations of upper/lower triangular storage and row/column-storage. + // The row-stored lower triangular and column-stored upper triangular + // trees are identical. Same for the remaining two trees. 
+ if ( bli_obj_is_lower( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol; + else her2_cntl = her2_cntl_bs_ke_lcol_urow; + } + else // if ( bli_obj_is_upper( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow; + else her2_cntl = her2_cntl_bs_ke_lrow_ucol; + } } else { @@ -101,8 +111,16 @@ void bli_syr2( obj_t* alpha, // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. - if ( bli_obj_is_row_tilted( *c ) ) her2_cntl = her2_cntl_ge_row; - else her2_cntl = her2_cntl_ge_col; + if ( bli_obj_is_lower( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lrow_ucol; + else her2_cntl = her2_cntl_ge_lcol_urow; + } + else // if ( bli_obj_is_upper( *c ) ) + { + if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lcol_urow; + else her2_cntl = her2_cntl_ge_lrow_ucol; + } } diff --git a/frame/2/trmv/bli_trmv_int.c b/frame/2/trmv/bli_trmv_int.c index ee15a0265..fecaafe85 100644 --- a/frame/2/trmv/bli_trmv_int.c +++ b/frame/2/trmv/bli_trmv_int.c @@ -74,8 +74,7 @@ void bli_trmv_int( obj_t* alpha, if ( bli_error_checking_is_enabled() ) bli_trmv_int_check( alpha, a, x, cntl ); - // Return early if one of the operands has a zero dimension. - if ( bli_obj_has_zero_dim( *a ) ) return; + // If x has a zero dimension, return early. if ( bli_obj_has_zero_dim( *x ) ) return; // Alias A in case we need to induce a transformation (ie: transposition). 
diff --git a/frame/2/trmv/bli_trmv_unf_var1.c b/frame/2/trmv/bli_trmv_unf_var1.c index f027c39e4..a5a14a4a4 100644 --- a/frame/2/trmv/bli_trmv_unf_var1.c +++ b/frame/2/trmv/bli_trmv_unf_var1.c @@ -168,7 +168,7 @@ void PASTEMAC2(cha,chx,varname)( \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ - f = bli_min( m - iter, b_fuse ); \ + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \ i = iter; \ n_ahead = m - iter - f; \ A11 = a_cast + (i )*rs_at + (i )*cs_at; \ @@ -223,12 +223,12 @@ void PASTEMAC2(cha,chx,varname)( \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ - f = bli_min( m - iter, b_fuse ); \ - i = m - iter; \ - n_ahead = i - f; \ - A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \ - A10 = a_cast + (i-f)*rs_at + (0 )*cs_at; \ - x1 = x_cast + (i-f)*incx; \ + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \ + i = m - iter - f; \ + n_ahead = i; \ + A11 = a_cast + (i )*rs_at + (i )*cs_at; \ + A10 = a_cast + (i )*rs_at + (0 )*cs_at; \ + x1 = x_cast + (i )*incx; \ x0 = x_cast + (0 )*incx; \ \ /* x1 = alpha * A11 * x1; */ \ diff --git a/frame/2/trmv/bli_trmv_unf_var2.c b/frame/2/trmv/bli_trmv_unf_var2.c index f4c8c65b9..93830bec6 100644 --- a/frame/2/trmv/bli_trmv_unf_var2.c +++ b/frame/2/trmv/bli_trmv_unf_var2.c @@ -167,7 +167,7 @@ void PASTEMAC2(cha,chx,varname)( \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ - f = bli_min( m - iter, b_fuse ); \ + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \ i = iter; \ n_behind = i; \ A11 = a_cast + (i )*rs_at + (i )*cs_at; \ @@ -220,13 +220,13 @@ void PASTEMAC2(cha,chx,varname)( \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ - f = bli_min( m - iter, b_fuse ); \ - i = m - iter; \ + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \ + i = m - iter - f; \ n_behind = iter; \ - A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \ - A21 = a_cast + (i )*rs_at + (i-f)*cs_at; \ - x1 = x_cast + (i-f)*incx; \ - x2 = x_cast + (i )*incx; \ + A11 = a_cast + (i )*rs_at + (i )*cs_at; \ + A21 = a_cast + (i+f)*rs_at + (i 
)*cs_at; \ + x1 = x_cast + (i )*incx; \ + x2 = x_cast + (i+f)*incx; \ \ /* x2 = x2 + alpha * A21 * x1; */ \ PASTEMAC3(cha,chx,chx,kername)( conja, \ diff --git a/frame/2/trsv/bli_trsv_int.c b/frame/2/trsv/bli_trsv_int.c index 71ff52b1f..c55d7af50 100644 --- a/frame/2/trsv/bli_trsv_int.c +++ b/frame/2/trsv/bli_trsv_int.c @@ -74,8 +74,7 @@ void bli_trsv_int( obj_t* alpha, if ( bli_error_checking_is_enabled() ) bli_trsv_int_check( alpha, a, x, cntl ); - // Return early if one of the operands has a zero dimension. - if ( bli_obj_has_zero_dim( *a ) ) return; + // If x has a zero dimension, return early. if ( bli_obj_has_zero_dim( *x ) ) return; // Alias A in case we need to induce a transformation (ie: transposition). diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c index 7e1822d43..525758a9d 100644 --- a/frame/2/trsv/bli_trsv_unf_var1.c +++ b/frame/2/trsv/bli_trsv_unf_var1.c @@ -175,13 +175,13 @@ void PASTEMAC2(cha,chx,varname)( \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ - f = bli_min( m - iter, b_fuse ); \ - i = m - iter; \ + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \ + i = m - iter - f; \ n_behind = iter; \ - A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \ - A12 = a_cast + (i-f)*rs_at + (i )*cs_at; \ - x1 = x_cast + (i-f)*incx; \ - x2 = x_cast + (i )*incx; \ + A11 = a_cast + (i )*rs_at + (i )*cs_at; \ + A12 = a_cast + (i )*rs_at + (i+f)*cs_at; \ + x1 = x_cast + (i )*incx; \ + x2 = x_cast + (i+f)*incx; \ \ /* x1 = x1 - A12 * x2; */ \ PASTEMAC3(cha,chx,chx,kername)( conja, \ @@ -231,7 +231,7 @@ void PASTEMAC2(cha,chx,varname)( \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ - f = bli_min( m - iter, b_fuse ); \ + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \ i = iter; \ n_behind = i; \ A11 = a_cast + (i )*rs_at + (i )*cs_at; \ diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c index f4659c3e3..145f6af66 100644 --- a/frame/2/trsv/bli_trsv_unf_var2.c +++ 
b/frame/2/trsv/bli_trsv_unf_var2.c @@ -174,12 +174,12 @@ void PASTEMAC2(cha,chx,varname)( \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ - f = bli_min( m - iter, b_fuse ); \ - i = m - iter; \ - n_ahead = i - f; \ - A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \ - A01 = a_cast + (0 )*rs_at + (i-f)*cs_at; \ - x1 = x_cast + (i-f)*incx; \ + f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \ + i = m - iter - f; \ + n_ahead = i; \ + A11 = a_cast + (i )*rs_at + (i )*cs_at; \ + A01 = a_cast + (0 )*rs_at + (i )*cs_at; \ + x1 = x_cast + (i )*incx; \ x0 = x_cast + (0 )*incx; \ \ /* x1 = x1 / triu( A11 ); */ \ @@ -228,7 +228,7 @@ void PASTEMAC2(cha,chx,varname)( \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ - f = bli_min( m - iter, b_fuse ); \ + f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \ i = iter; \ n_ahead = m - iter - f; \ A11 = a_cast + (i )*rs_at + (i )*cs_at; \ diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index eb947b98a..1b7d726de 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -70,11 +70,17 @@ void bli_gemm_int( obj_t* alpha, if ( bli_error_checking_is_enabled() ) bli_gemm_int_check( alpha, a, b, beta, c, cntl ); - // Return early if one of the matrix operands has a zero dimension. - if ( bli_obj_has_zero_dim( *a ) ) return; - if ( bli_obj_has_zero_dim( *b ) ) return; + // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( *c ) ) return; + // If A or B has a zero dimension, scale C by beta and return early. + if ( bli_obj_has_zero_dim( *a ) || + bli_obj_has_zero_dim( *b ) ) + { + bli_scalm( beta, c ); + return; + } + // Alias C in case we need to induce a transposition. 
bli_obj_alias_to( *c, c_local ); diff --git a/frame/3/her2k/bli_her2k_int.c b/frame/3/her2k/bli_her2k_int.c index d41f90380..2362bbfc6 100644 --- a/frame/3/her2k/bli_her2k_int.c +++ b/frame/3/her2k/bli_her2k_int.c @@ -88,12 +88,18 @@ void bli_her2k_int( obj_t* alpha, if ( bli_error_checking_is_enabled() ) bli_her2k_int_check( alpha, a, bh, alpha_conj, b, ah, beta, c, cntl ); - // Return early if one of the matrix operands has a zero dimension. - if ( bli_obj_has_zero_dim( *a ) ) return; - if ( bli_obj_has_zero_dim( *bh ) ) return; - if ( bli_obj_has_zero_dim( *b ) ) return; - if ( bli_obj_has_zero_dim( *ah ) ) return; - if ( bli_obj_has_zero_dim( *c ) ) return; + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( *c ) ) return; + + // If A or B has a zero dimension, scale C by beta and return early. + if ( bli_obj_has_zero_dim( *a ) || + bli_obj_has_zero_dim( *ah ) || + bli_obj_has_zero_dim( *b ) || + bli_obj_has_zero_dim( *bh ) ) + { + bli_scalm( beta, c ); + return; + } // Alias C in case we need to induce a transposition. bli_obj_alias_to( *c, c_local ); diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/bli_herk_int.c index b9a937ea8..85a09ce34 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/bli_herk_int.c @@ -82,10 +82,16 @@ void bli_herk_int( obj_t* alpha, if ( bli_error_checking_is_enabled() ) bli_herk_int_check( alpha, a, ah, beta, c, cntl ); - // Return early if one of the matrix operands has a zero dimension. - if ( bli_obj_has_zero_dim( *a ) ) return; - if ( bli_obj_has_zero_dim( *ah ) ) return; - if ( bli_obj_has_zero_dim( *c ) ) return; + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( *c ) ) return; + + // If A or A' has a zero dimension, scale C by beta and return early. + if ( bli_obj_has_zero_dim( *a ) || + bli_obj_has_zero_dim( *ah ) ) + { + bli_scalm( beta, c ); + return; + } // Alias C in case we need to induce a transposition. 
bli_obj_alias_to( *c, c_local ); diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/bli_trmm_int.c index 6739af7ed..79f64cdf2 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/bli_trmm_int.c @@ -83,11 +83,17 @@ void bli_trmm_int( side_t side, if ( bli_error_checking_is_enabled() ) bli_trmm_int_check( side, alpha, a, b, beta, c, cntl ); - // Return early if one of the matrix operands has a zero dimension. - if ( bli_obj_has_zero_dim( *a ) ) return; - if ( bli_obj_has_zero_dim( *b ) ) return; + // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( *c ) ) return; + // If A or B has a zero dimension, scale C by beta and return early. + if ( bli_obj_has_zero_dim( *a ) || + bli_obj_has_zero_dim( *b ) ) + { + bli_scalm( beta, c ); + return; + } + // Alias C in case we need to induce a transposition. bli_obj_alias_to( *c, c_local ); diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index 6eaaa3952..50158d5d5 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -83,11 +83,17 @@ void bli_trsm_int( side_t side, if ( bli_error_checking_is_enabled() ) bli_trsm_int_check( side, alpha, a, b, beta, c, cntl ); - // Return early if one of the matrix operands has a zero dimension. - if ( bli_obj_has_zero_dim( *a ) ) return; - if ( bli_obj_has_zero_dim( *b ) ) return; + // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( *c ) ) return; + // If A or B has a zero dimension, scale C by beta and return early. + if ( bli_obj_has_zero_dim( *a ) || + bli_obj_has_zero_dim( *b ) ) + { + bli_scalm( beta, c ); + return; + } + // Alias C in case we need to induce a transposition. 
bli_obj_alias_to( *c, c_local ); diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 4b002ce0b..9806110c2 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -485,8 +485,10 @@ bli_obj_width_stored( obj ) #define bli_obj_vector_inc( x ) \ \ + ( bli_obj_is_scalar( x ) ? 1 : \ ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) \ - : bli_obj_row_stride( x ) ) + : bli_obj_row_stride( x ) ) \ + ) #define bli_obj_is_vector( x ) \ \ @@ -506,6 +508,11 @@ bli_obj_width_stored( obj ) ( bli_obj_length( obj ) == 0 || \ bli_obj_width( obj ) == 0 ) +#define bli_obj_is_scalar( x ) \ +\ + ( bli_obj_length( x ) == 1 && \ + bli_obj_width( x ) == 1 ) + // Dimension modification diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 3e24a12c5..aecdb384c 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -314,6 +314,19 @@ else { mt = n; nt = m; rst = cs; cst = rs; } \ } + +// blocksize-related: partition size at offset i along a dimension of length dim for blocksize b_alg (_f yields min(b_alg, dim - i); _b yields dim % b_alg at i == 0 when that remainder is nonzero, else b_alg); arguments are parenthesized so expressions may be passed, but may be evaluated more than once + +#define bli_determine_blocksize_dim_f( i, dim, b_alg ) \ +\ + ( bli_min( (b_alg), (dim) - (i) ) ) + +#define bli_determine_blocksize_dim_b( i, dim, b_alg ) \ +\ + ( (i) == 0 && (dim) % (b_alg) != 0 ?
(dim) % (b_alg) \ + : (b_alg) ) + + // stride-related #define bli_vector_inc( trans, m, n, rs, cs ) \ diff --git a/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.c b/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.c index 1fde94931..07269de39 100644 --- a/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.c +++ b/kernels/x86/3/bli_gemmtrsm_l_opt_d4x2.c @@ -272,13 +272,12 @@ void bli_dgemmtrsm_l_opt_d4x2( "movl %10, %%eax \n\t" // load address of alpha "movddup (%%eax), %%xmm7 \n\t" // load alpha and duplicate " \n\t" - "movapd 0 * 16(%%ebx), %%xmm4 \n\t" // load xmm4 = ( beta00 beta01 ) - "movapd 1 * 16(%%ebx), %%xmm5 \n\t" // load xmm5 = ( beta10 beta11 ) - "movapd 2 * 16(%%ebx), %%xmm6 \n\t" // load xmm6 = ( beta20 beta21 ) - "mulpd %%xmm7, %%xmm4 \n\t" // xmm4 *= alpha - "mulpd %%xmm7, %%xmm5 \n\t" // xmm5 *= alpha - "mulpd %%xmm7, %%xmm6 \n\t" // xmm6 *= alpha - //"movapd 3 * 16(%%ebx), %%xmm7 \n\t" // load xmm7 = ( beta30 beta31 ) + "movapd 0 * 16(%%ebx), %%xmm4 \n\t" + "movapd 1 * 16(%%ebx), %%xmm5 \n\t" + "mulpd %%xmm7, %%xmm4 \n\t" // xmm4 = alpha * ( beta00 beta01 ) + "mulpd %%xmm7, %%xmm5 \n\t" // xmm5 = alpha * ( beta10 beta11 ) + "movapd 2 * 16(%%ebx), %%xmm6 \n\t" + "mulpd %%xmm7, %%xmm6 \n\t" // xmm6 = alpha * ( beta20 beta21 ) "mulpd 3 * 16(%%ebx), %%xmm7 \n\t" // xmm7 = alpha * ( beta30 beta31 ) " \n\t" "subpd %%xmm0, %%xmm4 \n\t" // xmm4 -= xmm0 diff --git a/kernels/x86_64/1/bli_dotv_opt_var1.c b/kernels/x86_64/1/bli_dotv_opt_var1.c index 1e055cc72..b83d3954f 100644 --- a/kernels/x86_64/1/bli_dotv_opt_var1.c +++ b/kernels/x86_64/1/bli_dotv_opt_var1.c @@ -117,11 +117,11 @@ void PASTEMAC3(chx,chy,chr,varname)( \ \ if ( bli_zero_dim1( n ) ) \ { \ - PASTEMAC(chr,set0)( *rho_cast ); \ + PASTEMAC(chr,set0s)( *rho_cast ); \ return; \ } \ \ - PASTEMAC(chr,set0)( dotxy ); \ + PASTEMAC(chr,set0s)( dotxy ); \ \ chi1 = x_cast; \ psi1 = y_cast; \ @@ -216,7 +216,7 @@ void bli_ddddotv_opt_var1( if ( bli_zero_dim1( n ) ) { - PASTEMAC(d,set0)( *rho_cast ); + PASTEMAC(d,set0s)( *rho_cast ); return; } @@
-238,7 +238,7 @@ void bli_ddddotv_opt_var1( x1 = x_cast; y1 = y_cast; - PASTEMAC(d,set0)( rho1 ); + PASTEMAC(d,set0s)( rho1 ); if ( n_pre == 1 ) { diff --git a/kernels/x86_64/1f/bli_dotaxpyv_opt_var1.c b/kernels/x86_64/1f/bli_dotaxpyv_opt_var1.c index de9e32252..6503a4f7e 100644 --- a/kernels/x86_64/1f/bli_dotaxpyv_opt_var1.c +++ b/kernels/x86_64/1f/bli_dotaxpyv_opt_var1.c @@ -133,7 +133,7 @@ void bli_ddddotaxpyv_opt_var1( if ( bli_zero_dim1( n ) ) { - PASTEMAC(d,set0)( *rho_cast ); + PASTEMAC(d,set0s)( *rho_cast ); return; } @@ -153,7 +153,7 @@ void bli_ddddotaxpyv_opt_var1( stepy = 2 * incy; stepz = 2 * incz; - PASTEMAC(d,set0)( rho1c ); + PASTEMAC(d,set0s)( rho1c ); alpha1c = *alpha_cast; diff --git a/kernels/x86_64/1f/bli_dotxaxpyf_opt_var1.c b/kernels/x86_64/1f/bli_dotxaxpyf_opt_var1.c index ca978d7ed..b6ac67f8d 100644 --- a/kernels/x86_64/1f/bli_dotxaxpyf_opt_var1.c +++ b/kernels/x86_64/1f/bli_dotxaxpyf_opt_var1.c @@ -233,11 +233,11 @@ void bli_ddddotxaxpyf_opt_var1( PASTEMAC2(d,d,scals)( *alpha_cast, chi2 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi3 ); - PASTEMAC(d,set0)( rho0 ); - PASTEMAC(d,set0)( rho1 ); - PASTEMAC(d,set0)( rho2 ); - PASTEMAC(d,set0)( rho3 ); - PASTEMAC(d,set0)( zeta1 ); + PASTEMAC(d,set0s)( rho0 ); + PASTEMAC(d,set0s)( rho1 ); + PASTEMAC(d,set0s)( rho2 ); + PASTEMAC(d,set0s)( rho3 ); + PASTEMAC(d,set0s)( zeta1 ); if ( m_pre == 1 ) { diff --git a/kernels/x86_64/1f/bli_dotxf_opt_var1.c b/kernels/x86_64/1f/bli_dotxf_opt_var1.c index ff943309c..f9dcff631 100644 --- a/kernels/x86_64/1f/bli_dotxf_opt_var1.c +++ b/kernels/x86_64/1f/bli_dotxf_opt_var1.c @@ -267,10 +267,10 @@ void bli_ddddotxf_opt_var1( x3 = x_cast + 3*ldx; y0 = y_cast; - PASTEMAC(d,set0)( rho0 ); - PASTEMAC(d,set0)( rho1 ); - PASTEMAC(d,set0)( rho2 ); - PASTEMAC(d,set0)( rho3 ); + PASTEMAC(d,set0s)( rho0 ); + PASTEMAC(d,set0s)( rho1 ); + PASTEMAC(d,set0s)( rho2 ); + PASTEMAC(d,set0s)( rho3 ); if ( n_pre == 1 ) { diff --git a/kernels/x86_64/3/bli_gemm_opt_d4x4.c 
b/kernels/x86_64/3/bli_gemm_opt_d4x4.c index 5bd4bf5b8..fdce624c7 100644 --- a/kernels/x86_64/3/bli_gemm_opt_d4x4.c +++ b/kernels/x86_64/3/bli_gemm_opt_d4x4.c @@ -281,8 +281,8 @@ void bli_dgemm_opt_d4x4( "movaps -5 * 16(%%rax), %%xmm1 \n\t" " \n\t" " \n\t" - "addq $4 * 4 * 8, %%rax \n\t" // a += 4 (1 x mr) - "addq $4 * 4 * 8, %%rbx \n\t" // b += 4 (1 x nr) + "addq $4 * 1 * 8, %%rax \n\t" // a += 4 (1 x mr) + "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; diff --git a/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.c b/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.c index fc1f3e389..2aadc1c88 100644 --- a/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.c +++ b/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.c @@ -36,6 +36,7 @@ void bli_sgemmtrsm_l_opt_d4x4( dim_t k, + float* restrict alpha, float* restrict a10, float* restrict a11, float* restrict bd01, @@ -51,6 +52,7 @@ void bli_sgemmtrsm_l_opt_d4x4( void bli_dgemmtrsm_l_opt_d4x4( dim_t k, + double* restrict alpha, double* restrict a10, double* restrict a11, double* restrict bd01, @@ -334,14 +336,26 @@ void bli_dgemmtrsm_l_opt_d4x4( " \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) " \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) " \n\t" - "movapd 0 * 16(%%rbx), %%xmm8 \n\t" + "movq %10, %%rax \n\t" // load address of alpha + "movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate + " \n\t" + "movapd 0 * 16(%%rbx), %%xmm8 \n\t" "movapd 1 * 16(%%rbx), %%xmm12 \n\t" + "mulpd %%xmm15, %%xmm8 \n\t" // xmm8 = alpha * ( beta00 beta01 ) + "mulpd %%xmm15, %%xmm12 \n\t" // xmm12 = alpha * ( beta02 beta03 ) "movapd 2 * 16(%%rbx), %%xmm9 \n\t" "movapd 3 * 16(%%rbx), %%xmm13 \n\t" + "mulpd %%xmm15, %%xmm9 \n\t" // xmm9 = alpha * ( beta10 beta11 ) + "mulpd %%xmm15, %%xmm13 \n\t" // xmm13 = alpha * ( beta12 beta13 ) "movapd 4 * 16(%%rbx), %%xmm10 \n\t" "movapd 5 * 16(%%rbx), %%xmm14 \n\t" + "mulpd %%xmm15, %%xmm10 \n\t" // xmm10 = alpha * ( beta20 beta21 ) + "mulpd %%xmm15, %%xmm14 \n\t" // xmm14 = 
alpha * ( beta22 beta23 ) "movapd 6 * 16(%%rbx), %%xmm11 \n\t" - "movapd 7 * 16(%%rbx), %%xmm15 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" // xmm11 = alpha * ( beta30 beta31 ) + "mulpd 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = alpha * ( beta32 beta33 ) + " \n\t" + " \n\t" // (Now scaled by alpha:) " \n\t" // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) " \n\t" // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) " \n\t" // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) @@ -491,7 +505,8 @@ void bli_dgemmtrsm_l_opt_d4x4( "m" (b11), "m" (c11), "m" (rs_c), - "m" (cs_c) + "m" (cs_c), + "m" (alpha) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "xmm0", "xmm1", "xmm2", "xmm3", @@ -505,6 +520,7 @@ void bli_dgemmtrsm_l_opt_d4x4( void bli_cgemmtrsm_l_opt_d4x4( dim_t k, + scomplex* restrict alpha, scomplex* restrict a10, scomplex* restrict a11, scomplex* restrict bd01, @@ -520,6 +536,7 @@ void bli_cgemmtrsm_l_opt_d4x4( void bli_zgemmtrsm_l_opt_d4x4( dim_t k, + dcomplex* restrict alpha, dcomplex* restrict a10, dcomplex* restrict a11, dcomplex* restrict bd01, diff --git a/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.h b/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.h index 11962c88b..29fffc5e9 100644 --- a/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.h +++ b/kernels/x86_64/3/bli_gemmtrsm_l_opt_d4x4.h @@ -38,6 +38,7 @@ \ void PASTEMAC(ch,varname)( \ dim_t k, \ + ctype* restrict alpha, \ ctype* restrict a10, \ ctype* restrict a11, \ ctype* restrict bd01, \ diff --git a/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.c b/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.c index 469964f45..59469b07f 100644 --- a/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.c +++ b/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.c @@ -36,6 +36,7 @@ void bli_sgemmtrsm_u_opt_d4x4( dim_t k, + float* restrict alpha, float* restrict a12, float* restrict a11, float* restrict bd21, @@ -51,6 +52,7 @@ void bli_sgemmtrsm_u_opt_d4x4( void bli_dgemmtrsm_u_opt_d4x4( dim_t k, + double* restrict alpha, double* restrict a12, double* 
restrict a11, double* restrict bd21, @@ -282,8 +284,8 @@ void bli_dgemmtrsm_u_opt_d4x4( "movaps -5 * 16(%%rax), %%xmm1 \n\t" " \n\t" " \n\t" - "addq $4 * 4 * 8, %%rax \n\t" // a += 4 (1 x mr) - "addq $4 * 4 * 8, %%rbx \n\t" // b += 4 (1 x nr) + "addq $4 * 1 * 8, %%rax \n\t" // a += 4 (1 x mr) + "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; @@ -334,14 +336,26 @@ void bli_dgemmtrsm_u_opt_d4x4( " \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) " \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) " \n\t" + "movq %10, %%rax \n\t" // load address of alpha + "movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate + " \n\t" "movapd 0 * 16(%%rbx), %%xmm8 \n\t" "movapd 1 * 16(%%rbx), %%xmm12 \n\t" + "mulpd %%xmm15, %%xmm8 \n\t" // xmm8 = alpha * ( beta00 beta01 ) + "mulpd %%xmm15, %%xmm12 \n\t" // xmm12 = alpha * ( beta02 beta03 ) "movapd 2 * 16(%%rbx), %%xmm9 \n\t" "movapd 3 * 16(%%rbx), %%xmm13 \n\t" + "mulpd %%xmm15, %%xmm9 \n\t" // xmm9 = alpha * ( beta10 beta11 ) + "mulpd %%xmm15, %%xmm13 \n\t" // xmm13 = alpha * ( beta12 beta13 ) "movapd 4 * 16(%%rbx), %%xmm10 \n\t" "movapd 5 * 16(%%rbx), %%xmm14 \n\t" + "mulpd %%xmm15, %%xmm10 \n\t" // xmm10 = alpha * ( beta20 beta21 ) + "mulpd %%xmm15, %%xmm14 \n\t" // xmm14 = alpha * ( beta22 beta23 ) "movapd 6 * 16(%%rbx), %%xmm11 \n\t" - "movapd 7 * 16(%%rbx), %%xmm15 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" // xmm11 = alpha * ( beta30 beta31 ) + "mulpd 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = alpha * ( beta32 beta33 ) + " \n\t" + " \n\t" // (Now scaled by alpha:) " \n\t" // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) " \n\t" // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) " \n\t" // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) @@ -494,7 +508,8 @@ void bli_dgemmtrsm_u_opt_d4x4( "m" (b11), "m" (c11), "m" (rs_c), - "m" (cs_c) + "m" (cs_c), + "m" (alpha) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "xmm0", "xmm1", "xmm2", "xmm3", @@ -508,6 +523,7 @@ 
void bli_dgemmtrsm_u_opt_d4x4( void bli_cgemmtrsm_u_opt_d4x4( dim_t k, + scomplex* restrict alpha, scomplex* restrict a12, scomplex* restrict a11, scomplex* restrict bd21, @@ -523,6 +539,7 @@ void bli_cgemmtrsm_u_opt_d4x4( void bli_zgemmtrsm_u_opt_d4x4( dim_t k, + dcomplex* restrict alpha, dcomplex* restrict a12, dcomplex* restrict a11, dcomplex* restrict bd21, diff --git a/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.h b/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.h index c88a696a2..8ea91dd65 100644 --- a/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.h +++ b/kernels/x86_64/3/bli_gemmtrsm_u_opt_d4x4.h @@ -38,6 +38,7 @@ \ void PASTEMAC(ch,varname)( \ dim_t k, \ + ctype* restrict alpha, \ ctype* restrict a12, \ ctype* restrict a11, \ ctype* restrict bd21, \ diff --git a/testsuite/input.operations b/testsuite/input.operations index 9449fd0f6..8327670b9 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -158,7 +158,7 @@ 1 gemm (0 = disable all; 1 = specify) 1 test sequential front-end (0 = disable; 1 = enable) --1 -3 -2 dimensions: m n k (-1 = bind to problem size) +-1 -2 -3 dimensions: m n k (-1 = bind to problem size) ?? parameters: transa transb (? = test all values) 1 hemm (0 = disable all; 1 = specify) diff --git a/testsuite/src/test_addm.c b/testsuite/src/test_addm.c index cec2cc41a..3c66e9182 100644 --- a/testsuite/src/test_addm.c +++ b/testsuite/src/test_addm.c @@ -170,6 +170,9 @@ void libblis_test_addm_experiment( test_params_t* params, // Perform checks. libblis_test_addm_check( &alpha, &beta, &x, &y, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. 
bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_addv.c b/testsuite/src/test_addv.c index 967846e59..b87d9f035 100644 --- a/testsuite/src/test_addv.c +++ b/testsuite/src/test_addv.c @@ -166,6 +166,9 @@ void libblis_test_addv_experiment( test_params_t* params, // Perform checks. libblis_test_addv_check( &alpha, &beta, &x, &y, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_axpym.c b/testsuite/src/test_axpym.c index fd82ef369..cef8e9325 100644 --- a/testsuite/src/test_axpym.c +++ b/testsuite/src/test_axpym.c @@ -184,6 +184,9 @@ void libblis_test_axpym_experiment( test_params_t* params, // Perform checks. libblis_test_axpym_check( &alpha, &x, &y, &y_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_axpyv.c b/testsuite/src/test_axpyv.c index 596a25382..5bbf91fc1 100644 --- a/testsuite/src/test_axpyv.c +++ b/testsuite/src/test_axpyv.c @@ -182,6 +182,9 @@ void libblis_test_axpyv_experiment( test_params_t* params, // Perform checks. libblis_test_axpyv_check( &alpha, &x, &y, &y_save, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_copym.c b/testsuite/src/test_copym.c index 0d4793329..b79704b32 100644 --- a/testsuite/src/test_copym.c +++ b/testsuite/src/test_copym.c @@ -160,6 +160,9 @@ void libblis_test_copym_experiment( test_params_t* params, // Perform checks. libblis_test_copym_check( &x, &y, resid ); + // Zero out performance and residual if output matrix is empty. 
+ libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_copyv.c b/testsuite/src/test_copyv.c index 7c19e5b42..a6e16b5b6 100644 --- a/testsuite/src/test_copyv.c +++ b/testsuite/src/test_copyv.c @@ -157,6 +157,9 @@ void libblis_test_copyv_experiment( test_params_t* params, // Perform checks. libblis_test_copyv_check( &x, &y, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_dotv.c b/testsuite/src/test_dotv.c index 1585473eb..ec7607bc9 100644 --- a/testsuite/src/test_dotv.c +++ b/testsuite/src/test_dotv.c @@ -181,6 +181,9 @@ void libblis_test_dotv_experiment( test_params_t* params, // Perform checks. libblis_test_dotv_check( &x, &y, &rho, resid ); + // Zero out performance and residual if output scalar is empty. + libblis_test_check_empty_problem( &rho, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_dotxv.c b/testsuite/src/test_dotxv.c index df4a3aa59..59a447ee7 100644 --- a/testsuite/src/test_dotxv.c +++ b/testsuite/src/test_dotxv.c @@ -196,6 +196,9 @@ void libblis_test_dotxv_experiment( test_params_t* params, // Perform checks. libblis_test_dotxv_check( &alpha, &x, &y, &beta, &rho, &rho_save, resid ); + // Zero out performance and residual if output scalar is empty. + libblis_test_check_empty_problem( &rho, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_fnormm.c b/testsuite/src/test_fnormm.c index 927f9ee79..bc9ca48f1 100644 --- a/testsuite/src/test_fnormm.c +++ b/testsuite/src/test_fnormm.c @@ -164,6 +164,9 @@ void libblis_test_fnormm_experiment( test_params_t* params, // Perform checks. 
libblis_test_fnormm_check( &beta, &x, &norm, resid ); + // Zero out performance and residual if input matrix is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + // Free the test objects. bli_obj_free( &x ); } diff --git a/testsuite/src/test_fnormv.c b/testsuite/src/test_fnormv.c index 17c83e159..95e4c4f01 100644 --- a/testsuite/src/test_fnormv.c +++ b/testsuite/src/test_fnormv.c @@ -162,6 +162,9 @@ void libblis_test_fnormv_experiment( test_params_t* params, // Perform checks. libblis_test_fnormv_check( &beta, &x, &norm, resid ); + // Zero out performance and residual if input vector is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + // Free the test objects. bli_obj_free( &x ); } diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index e97242617..af7be24b8 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -212,6 +212,9 @@ void libblis_test_gemm_experiment( test_params_t* params, // Perform checks. libblis_test_gemm_check( &alpha, &a, &b, &beta, &c, &c_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &b ); diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c index 5a865bb3e..f9e53e3f4 100644 --- a/testsuite/src/test_gemv.c +++ b/testsuite/src/test_gemv.c @@ -207,6 +207,9 @@ void libblis_test_gemv_experiment( test_params_t* params, // Perform checks. libblis_test_gemv_check( &kappa, &alpha, &a, &x, &beta, &y, &y_save, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. 
bli_obj_free( &a ); bli_obj_free( &x ); diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c index a0a73bfcb..316b4b1ca 100644 --- a/testsuite/src/test_ger.c +++ b/testsuite/src/test_ger.c @@ -194,6 +194,9 @@ void libblis_test_ger_experiment( test_params_t* params, // Perform checks. libblis_test_ger_check( &alpha, &x, &y, &a, &a_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &a, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c index 77952e9d2..5a3f3a0ee 100644 --- a/testsuite/src/test_hemm.c +++ b/testsuite/src/test_hemm.c @@ -228,6 +228,9 @@ void libblis_test_hemm_experiment( test_params_t* params, // Perform checks. libblis_test_hemm_check( side, &alpha, &a, &b, &beta, &c, &c_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &b ); diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c index 107905305..708e49fc5 100644 --- a/testsuite/src/test_hemv.c +++ b/testsuite/src/test_hemv.c @@ -220,6 +220,9 @@ void libblis_test_hemv_experiment( test_params_t* params, // Perform checks. libblis_test_hemv_check( &alpha, &a, &x, &beta, &y, &y_save, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &x ); diff --git a/testsuite/src/test_her.c b/testsuite/src/test_her.c index fd248fb7e..3e39e69e0 100644 --- a/testsuite/src/test_her.c +++ b/testsuite/src/test_her.c @@ -198,6 +198,9 @@ void libblis_test_her_experiment( test_params_t* params, // Perform checks. libblis_test_her_check( &alpha, &x, &a, &a_save, resid ); + // Zero out performance and residual if output matrix is empty. 
+ libblis_test_check_empty_problem( &a, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &a ); diff --git a/testsuite/src/test_her2.c b/testsuite/src/test_her2.c index 39f5300cb..4778a42c5 100644 --- a/testsuite/src/test_her2.c +++ b/testsuite/src/test_her2.c @@ -205,6 +205,9 @@ void libblis_test_her2_experiment( test_params_t* params, // Perform checks. libblis_test_her2_check( &alpha, &x, &y, &a, &a_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &a, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_her2k.c b/testsuite/src/test_her2k.c index 2e8ec19f3..582c578e9 100644 --- a/testsuite/src/test_her2k.c +++ b/testsuite/src/test_her2k.c @@ -228,6 +228,9 @@ void libblis_test_her2k_experiment( test_params_t* params, // Perform checks. libblis_test_her2k_check( &alpha, &a, &b, &beta, &c, &c_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &b ); diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c index 7e35fdc61..f939dca72 100644 --- a/testsuite/src/test_herk.c +++ b/testsuite/src/test_herk.c @@ -220,6 +220,9 @@ void libblis_test_herk_experiment( test_params_t* params, // Perform checks. libblis_test_herk_check( &alpha, &a, &beta, &c, &c_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + // Free the test objects. 
bli_obj_free( &a ); bli_obj_free( &c ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 175feb2f0..d001d42bb 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -1614,3 +1614,12 @@ void libblis_test_parse_command_line( int argc, char** argv ) +void libblis_test_check_empty_problem( obj_t* c, double* perf, double* resid ) +{ + if ( bli_obj_has_zero_dim( *c ) ) + { + *perf = 0.0; + *resid = 0.0; + } +} + diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 0158564a1..dfd576584 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -354,6 +354,10 @@ void libblis_test_printf_error( char* message, ... ); void libblis_test_parse_message( FILE* output_stream, char* message, va_list args ); void libblis_test_parse_command_line( int argc, char** argv ); +// --- Miscellaneous --- + +void libblis_test_check_empty_problem( obj_t* c, double* perf, double* resid ); // Sets *perf and *resid to 0.0 when c has a zero dimension; leaves both untouched otherwise. + // // --- Test module headers ----------------------------------------------------- diff --git a/testsuite/src/test_randm.c b/testsuite/src/test_randm.c index ac76ee4c1..acebc5a19 100644 --- a/testsuite/src/test_randm.c +++ b/testsuite/src/test_randm.c @@ -155,6 +155,9 @@ void libblis_test_randm_experiment( test_params_t* params, // we consider to be likely. libblis_test_randm_check( &x, resid ); + // Zero out performance and residual if input matrix is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + // Free the test objects. bli_obj_free( &x ); } diff --git a/testsuite/src/test_randv.c b/testsuite/src/test_randv.c index 75dbfa7d2..784aeac55 100644 --- a/testsuite/src/test_randv.c +++ b/testsuite/src/test_randv.c @@ -155,6 +155,9 @@ void libblis_test_randv_experiment( test_params_t* params, // we consider to be likely. libblis_test_randv_check( &x, resid ); + // Zero out performance and residual if output vector is empty.
+ libblis_test_check_empty_problem( &x, perf, resid ); + // Free the test objects. bli_obj_free( &x ); } diff --git a/testsuite/src/test_scal2m.c b/testsuite/src/test_scal2m.c index bb39532a9..942cf53ab 100644 --- a/testsuite/src/test_scal2m.c +++ b/testsuite/src/test_scal2m.c @@ -183,6 +183,9 @@ void libblis_test_scal2m_experiment( test_params_t* params, // Perform checks. libblis_test_scal2m_check( &alpha, &x, &y, &y_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_scal2v.c b/testsuite/src/test_scal2v.c index 388d3a3f1..8f07e5588 100644 --- a/testsuite/src/test_scal2v.c +++ b/testsuite/src/test_scal2v.c @@ -181,6 +181,9 @@ void libblis_test_scal2v_experiment( test_params_t* params, // Perform checks. libblis_test_scal2v_check( &alpha, &x, &y, &y_save, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c index 3f9083f72..7c581f476 100644 --- a/testsuite/src/test_scalm.c +++ b/testsuite/src/test_scalm.c @@ -177,6 +177,9 @@ void libblis_test_scalm_experiment( test_params_t* params, // Perform checks. libblis_test_scalm_check( &beta, &y, &y_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &y ); bli_obj_free( &y_save ); diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c index 924b6a8cf..8b7aa1fa0 100644 --- a/testsuite/src/test_scalv.c +++ b/testsuite/src/test_scalv.c @@ -174,6 +174,9 @@ void libblis_test_scalv_experiment( test_params_t* params, // Perform checks. 
libblis_test_scalv_check( &beta, &y, &y_save, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &y ); bli_obj_free( &y_save ); diff --git a/testsuite/src/test_setm.c b/testsuite/src/test_setm.c index e28a7e946..3d9f385a6 100644 --- a/testsuite/src/test_setm.c +++ b/testsuite/src/test_setm.c @@ -160,6 +160,9 @@ void libblis_test_setm_experiment( test_params_t* params, // Perform checks. libblis_test_setm_check( &beta, &x, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + // Free the test objects. bli_obj_free( &x ); } diff --git a/testsuite/src/test_setv.c b/testsuite/src/test_setv.c index e2b31b9cb..b22538329 100644 --- a/testsuite/src/test_setv.c +++ b/testsuite/src/test_setv.c @@ -158,6 +158,9 @@ void libblis_test_setv_experiment( test_params_t* params, // Perform checks. libblis_test_setv_check( &beta, &x, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + // Free the test objects. bli_obj_free( &x ); } diff --git a/testsuite/src/test_subm.c b/testsuite/src/test_subm.c index 727e4eca8..a26eb0e3a 100644 --- a/testsuite/src/test_subm.c +++ b/testsuite/src/test_subm.c @@ -170,6 +170,9 @@ void libblis_test_subm_experiment( test_params_t* params, // Perform checks. libblis_test_subm_check( &alpha, &beta, &x, &y, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_subv.c b/testsuite/src/test_subv.c index dd776071f..87157324b 100644 --- a/testsuite/src/test_subv.c +++ b/testsuite/src/test_subv.c @@ -167,6 +167,9 @@ void libblis_test_subv_experiment( test_params_t* params, // Perform checks. 
libblis_test_subv_check( &alpha, &beta, &x, &y, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c index 58bba03ed..a2e212670 100644 --- a/testsuite/src/test_symm.c +++ b/testsuite/src/test_symm.c @@ -228,6 +228,9 @@ void libblis_test_symm_experiment( test_params_t* params, // Perform checks. libblis_test_symm_check( side, &alpha, &a, &b, &beta, &c, &c_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &b ); diff --git a/testsuite/src/test_symv.c b/testsuite/src/test_symv.c index 4d7a74d11..46638226c 100644 --- a/testsuite/src/test_symv.c +++ b/testsuite/src/test_symv.c @@ -220,6 +220,9 @@ void libblis_test_symv_experiment( test_params_t* params, // Perform checks. libblis_test_symv_check( &alpha, &a, &x, &beta, &y, &y_save, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &x ); diff --git a/testsuite/src/test_syr.c b/testsuite/src/test_syr.c index bc43a8344..17a9fdd26 100644 --- a/testsuite/src/test_syr.c +++ b/testsuite/src/test_syr.c @@ -197,6 +197,9 @@ void libblis_test_syr_experiment( test_params_t* params, // Perform checks. libblis_test_syr_check( &alpha, &x, &a, &a_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &a, perf, resid ); + // Free the test objects. 
bli_obj_free( &x ); bli_obj_free( &a ); diff --git a/testsuite/src/test_syr2.c b/testsuite/src/test_syr2.c index 751d91caa..87ec76fb0 100644 --- a/testsuite/src/test_syr2.c +++ b/testsuite/src/test_syr2.c @@ -204,6 +204,9 @@ void libblis_test_syr2_experiment( test_params_t* params, // Perform checks. libblis_test_syr2_check( &alpha, &x, &y, &a, &a_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &a, perf, resid ); + // Free the test objects. bli_obj_free( &x ); bli_obj_free( &y ); diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c index 9d9c53ce7..613cee523 100644 --- a/testsuite/src/test_syr2k.c +++ b/testsuite/src/test_syr2k.c @@ -228,6 +228,9 @@ void libblis_test_syr2k_experiment( test_params_t* params, // Perform checks. libblis_test_syr2k_check( &alpha, &a, &b, &beta, &c, &c_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &b ); diff --git a/testsuite/src/test_syrk.c b/testsuite/src/test_syrk.c index 4c258d38e..2a7ecd14d 100644 --- a/testsuite/src/test_syrk.c +++ b/testsuite/src/test_syrk.c @@ -220,6 +220,9 @@ void libblis_test_syrk_experiment( test_params_t* params, // Perform checks. libblis_test_syrk_check( &alpha, &a, &beta, &c, &c_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &c ); diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c index b3a03f433..3e07a719b 100644 --- a/testsuite/src/test_trmm.c +++ b/testsuite/src/test_trmm.c @@ -218,6 +218,9 @@ void libblis_test_trmm_experiment( test_params_t* params, // Perform checks. libblis_test_trmm_check( side, &alpha, &a, &b, &b_save, resid ); + // Zero out performance and residual if output matrix is empty. 
+ libblis_test_check_empty_problem( &b, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &b ); diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c index d3f91b42b..82a3c90b4 100644 --- a/testsuite/src/test_trmm3.c +++ b/testsuite/src/test_trmm3.c @@ -229,6 +229,9 @@ void libblis_test_trmm3_experiment( test_params_t* params, // Perform checks. libblis_test_trmm3_check( side, &alpha, &a, &b, &beta, &c, &c_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &b ); diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c index 3782daefe..169754c45 100644 --- a/testsuite/src/test_trmv.c +++ b/testsuite/src/test_trmv.c @@ -203,6 +203,9 @@ void libblis_test_trmv_experiment( test_params_t* params, // Perform checks. libblis_test_trmv_check( &alpha, &a, &x, &x_save, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &x ); diff --git a/testsuite/src/test_trsm.c b/testsuite/src/test_trsm.c index 09b632f2c..26edbce2d 100644 --- a/testsuite/src/test_trsm.c +++ b/testsuite/src/test_trsm.c @@ -219,6 +219,9 @@ void libblis_test_trsm_experiment( test_params_t* params, // Perform checks. libblis_test_trsm_check( side, &alpha, &a, &b, &b_save, resid ); + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &b, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &b ); diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c index 1f82229c2..c5decf381 100644 --- a/testsuite/src/test_trsv.c +++ b/testsuite/src/test_trsv.c @@ -203,6 +203,9 @@ void libblis_test_trsv_experiment( test_params_t* params, // Perform checks. 
libblis_test_trsv_check( &alpha, &a, &x, &x_save, resid ); + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + // Free the test objects. bli_obj_free( &a ); bli_obj_free( &x );