mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Fixed x86_64 kernel bugs and other minor issues.
Details:
- Fixed bugs in trmv_l and trsv_u caused by backwards iteration resulting in unaligned subpartitions. We were already going out of our way a bit to handle edge cases in the first iteration for blocked variants, and this was simply the unblocked-fused extension of that idea.
- Fixed control tree handling in her/her2/syr/syr2 that was not taking into account how the choice of variant needed to be altered for upper-stored matrices (given that only lower-stored algorithms are explicitly implemented).
- Added bli_determine_blocksize_dim_f(), bli_determine_blocksize_dim_b() macros to provide inlined versions of bli_determine_blocksize_[fb]() for use by unblocked-fused variants.
- Integrated new blocksize_dim macros into gemv/hemv unf variants for consistency with that of the bugfix for trmv/trsv (both of which now use the same macros).
- Modified bli_obj_vector_inc() so that 1 is returned if the object is a vector of length 1 (i.e., 1 x 1). This fixes a bug whereby, under certain conditions (e.g. dotv_opt_var1), an unexpected increment was returned: the code was expecting 1 (for purposes of performing contiguous vector loads) but got a value greater than 1 because the column stride of the object (e.g. rho) was inflated for alignment purposes (albeit unnecessarily, since there is only one element in the object).
- Replaced some old invocations of set0 with set0s.
- Added alpha parameter to gemmtrsm ukernels for x86_64 and use accordingly.
- Fixed increment bug in cleanup loop of gemm ukernel for x86_64.
- Added safeguard to test modules so that testing a problem with a zero dimension does not result in a failure.
- Tweaked handling of zero dimensions in level-2 and level-3 operations' internal back-ends to correctly handle cases where output operand still needs to be scaled (e.g. by beta, in the case of gemm with k = 0).
This commit is contained in:
@@ -75,11 +75,17 @@ void bli_gemv_int( trans_t transa,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemv_int_check( alpha, &a_local, &x_local, beta, y, cntl );
|
||||
|
||||
// Return early if one of the operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *x ) ) return;
|
||||
// If y has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *y ) ) return;
|
||||
|
||||
// If A or x has a zero dimension, scale y by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *x ) )
|
||||
{
|
||||
bli_scalm( beta, y );
|
||||
return;
|
||||
}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
|
||||
@@ -164,7 +164,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
\
|
||||
for ( i = 0; i < n_iter; i += f ) \
|
||||
{ \
|
||||
f = bli_min( n_iter - i, b_fuse ); \
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
|
||||
\
|
||||
A1 = a_cast + (i )*rs_at + (0 )*cs_at; \
|
||||
x1 = x_cast + (0 )*incy; \
|
||||
|
||||
@@ -182,7 +182,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
\
|
||||
for ( i = 0; i < n_iter; i += f ) \
|
||||
{ \
|
||||
f = bli_min( n_iter - i, b_fuse ); \
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
|
||||
\
|
||||
A1 = a_cast + (0 )*rs_at + (i )*cs_at; \
|
||||
x1 = x_cast + (i )*incx; \
|
||||
|
||||
@@ -71,11 +71,16 @@ void bli_ger_int( conj_t conjx,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_ger_int_check( alpha, x, y, a, cntl );
|
||||
|
||||
// Return early if one of the operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *x ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *y ) ) return;
|
||||
// If A has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
|
||||
// If x or y has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *x ) ||
|
||||
bli_obj_has_zero_dim( *y ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias the objects, applying conjx and conjy to x and y, respectively.
|
||||
bli_obj_alias_with_conj( conjx, *x, x_local );
|
||||
bli_obj_alias_with_conj( conjy, *y, y_local );
|
||||
|
||||
@@ -70,11 +70,17 @@ void bli_hemv_int( conj_t conjh,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_hemv_int_check( conjh, alpha, a, x, beta, y, cntl );
|
||||
|
||||
// Return early if one of the operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *x ) ) return;
|
||||
// If y has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *y ) ) return;
|
||||
|
||||
// If A or x has a zero dimension, scale y by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *x ) )
|
||||
{
|
||||
bli_scalm( beta, y );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A in case we need to induce the upper triangular case.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
|
||||
|
||||
@@ -215,7 +215,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
\
|
||||
for ( i = 0; i < m; i += f ) \
|
||||
{ \
|
||||
f = bli_min( m - i, b_fuse ); \
|
||||
f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
|
||||
n_behind = i; \
|
||||
A10 = a_cast + (i )*rs_at + (0 )*cs_at; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
|
||||
@@ -233,7 +233,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
\
|
||||
for ( i = 0; i < m; i += f ) \
|
||||
{ \
|
||||
f = bli_min( m - i, b_fuse ); \
|
||||
f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
|
||||
n_ahead = m - i - f; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
A21 = a_cast + (i+f)*rs_at + (i )*cs_at; \
|
||||
|
||||
@@ -34,10 +34,10 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern her_t* her_cntl_bs_ke_row;
|
||||
extern her_t* her_cntl_bs_ke_col;
|
||||
extern her_t* her_cntl_ge_row;
|
||||
extern her_t* her_cntl_ge_col;
|
||||
extern her_t* her_cntl_bs_ke_lrow_ucol;
|
||||
extern her_t* her_cntl_bs_ke_lcol_urow;
|
||||
extern her_t* her_cntl_ge_lrow_ucol;
|
||||
extern her_t* her_cntl_ge_lcol_urow;
|
||||
|
||||
void bli_her( obj_t* alpha,
|
||||
obj_t* x,
|
||||
@@ -79,10 +79,20 @@ void bli_her( obj_t* alpha,
|
||||
if ( x_is_contig &&
|
||||
c_is_contig )
|
||||
{
|
||||
// Use different control trees depending on storage of the matrix
|
||||
// operand.
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_row;
|
||||
else her_cntl = her_cntl_bs_ke_col;
|
||||
// We use two control trees to handle the four cases corresponding to
|
||||
// combinations of upper/lower triangular storage and row/column-storage.
|
||||
// The row-stored lower triangular and column-stored upper triangular
|
||||
// trees are identical. Same for the remaining two trees.
|
||||
if ( bli_obj_is_lower( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol;
|
||||
else her_cntl = her_cntl_bs_ke_lcol_urow;
|
||||
}
|
||||
else // if ( bli_obj_is_upper( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lcol_urow;
|
||||
else her_cntl = her_cntl_bs_ke_lrow_ucol;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -93,8 +103,16 @@ void bli_her( obj_t* alpha,
|
||||
|
||||
// Here, we make a similar choice as above, except that (1) we look
|
||||
// at storage tilt, and (2) we choose a tree that performs blocking.
|
||||
if ( bli_obj_is_row_tilted( *c ) ) her_cntl = her_cntl_ge_row;
|
||||
else her_cntl = her_cntl_ge_col;
|
||||
if ( bli_obj_is_lower( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lrow_ucol;
|
||||
else her_cntl = her_cntl_ge_lcol_urow;
|
||||
}
|
||||
else // if ( bli_obj_is_upper( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lcol_urow;
|
||||
else her_cntl = her_cntl_ge_lrow_ucol;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -45,11 +45,11 @@ extern ger_t* ger_cntl_bs_ke_col;
|
||||
|
||||
static blksz_t* her_mc;
|
||||
|
||||
her_t* her_cntl_bs_ke_row;
|
||||
her_t* her_cntl_bs_ke_col;
|
||||
her_t* her_cntl_bs_ke_lrow_ucol;
|
||||
her_t* her_cntl_bs_ke_lcol_urow;
|
||||
|
||||
her_t* her_cntl_ge_row;
|
||||
her_t* her_cntl_ge_col;
|
||||
her_t* her_cntl_ge_lrow_ucol;
|
||||
her_t* her_cntl_ge_lcol_urow;
|
||||
|
||||
// Cache blocksizes.
|
||||
|
||||
@@ -71,13 +71,13 @@ void bli_her_cntl_init()
|
||||
|
||||
// Create control trees for the lowest-level kernels. These trees induce
|
||||
// operations on (presumably) relatively small block-subvector problems.
|
||||
her_cntl_bs_ke_row
|
||||
her_cntl_bs_ke_lrow_ucol
|
||||
=
|
||||
bli_her_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
NULL, NULL, NULL,
|
||||
NULL, NULL, NULL );
|
||||
her_cntl_bs_ke_col
|
||||
her_cntl_bs_ke_lcol_urow
|
||||
=
|
||||
bli_her_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
@@ -88,34 +88,34 @@ void bli_her_cntl_init()
|
||||
// Create control trees for generally large problems. Here, we choose
|
||||
// variants that partition for ger subproblems in the same direction
|
||||
// as the assumed storage.
|
||||
her_cntl_ge_row
|
||||
her_cntl_ge_lrow_ucol
|
||||
=
|
||||
bli_her_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1, // use var1 for row storage
|
||||
BLIS_VARIANT1,
|
||||
her_mc,
|
||||
packv_cntl, // pack x1 (if needed)
|
||||
NULL, // do NOT pack C11
|
||||
ger_cntl_rp_bs_row,
|
||||
her_cntl_bs_ke_row,
|
||||
her_cntl_bs_ke_lrow_ucol,
|
||||
NULL ); // no unpacking needed
|
||||
her_cntl_ge_col
|
||||
her_cntl_ge_lcol_urow
|
||||
=
|
||||
bli_her_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2, // use var2 for col storage
|
||||
BLIS_VARIANT2,
|
||||
her_mc,
|
||||
packv_cntl, // pack x1 (if needed)
|
||||
NULL, // do NOT pack C11
|
||||
ger_cntl_cp_bs_col,
|
||||
her_cntl_bs_ke_col,
|
||||
her_cntl_bs_ke_lcol_urow,
|
||||
NULL ); // no unpacking needed
|
||||
}
|
||||
|
||||
void bli_her_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( her_cntl_bs_ke_row );
|
||||
bli_cntl_obj_free( her_cntl_bs_ke_col );
|
||||
bli_cntl_obj_free( her_cntl_ge_row );
|
||||
bli_cntl_obj_free( her_cntl_ge_col );
|
||||
bli_cntl_obj_free( her_cntl_bs_ke_lrow_ucol );
|
||||
bli_cntl_obj_free( her_cntl_bs_ke_lcol_urow );
|
||||
bli_cntl_obj_free( her_cntl_ge_lrow_ucol );
|
||||
bli_cntl_obj_free( her_cntl_ge_lcol_urow );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -67,9 +67,9 @@ void bli_her_int( conj_t conjh,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_her_int_check( conjh, alpha, x, c, cntl );
|
||||
|
||||
// Return early if one of the operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *x ) ) return;
|
||||
// If C or x has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *x ) ) return;
|
||||
|
||||
// Alias the operands in case we need to apply conjugations.
|
||||
bli_obj_alias_to( *x, x_local );
|
||||
|
||||
@@ -34,10 +34,10 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern her2_t* her2_cntl_bs_ke_row;
|
||||
extern her2_t* her2_cntl_bs_ke_col;
|
||||
extern her2_t* her2_cntl_ge_row;
|
||||
extern her2_t* her2_cntl_ge_col;
|
||||
extern her2_t* her2_cntl_bs_ke_lrow_ucol;
|
||||
extern her2_t* her2_cntl_bs_ke_lcol_urow;
|
||||
extern her2_t* her2_cntl_ge_lrow_ucol;
|
||||
extern her2_t* her2_cntl_ge_lcol_urow;
|
||||
|
||||
void bli_her2( obj_t* alpha,
|
||||
obj_t* x,
|
||||
@@ -93,10 +93,20 @@ void bli_her2( obj_t* alpha,
|
||||
y_is_contig &&
|
||||
c_is_contig )
|
||||
{
|
||||
// Use different control trees depending on storage of the matrix
|
||||
// operand.
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_row;
|
||||
else her2_cntl = her2_cntl_bs_ke_col;
|
||||
// We use two control trees to handle the four cases corresponding to
|
||||
// combinations of upper/lower triangular storage and row/column-storage.
|
||||
// The row-stored lower triangular and column-stored upper triangular
|
||||
// trees are identical. Same for the remaining two trees.
|
||||
if ( bli_obj_is_lower( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol;
|
||||
else her2_cntl = her2_cntl_bs_ke_lcol_urow;
|
||||
}
|
||||
else // if ( bli_obj_is_upper( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow;
|
||||
else her2_cntl = her2_cntl_bs_ke_lrow_ucol;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -108,8 +118,16 @@ void bli_her2( obj_t* alpha,
|
||||
|
||||
// Here, we make a similar choice as above, except that (1) we look
|
||||
// at storage tilt, and (2) we choose a tree that performs blocking.
|
||||
if ( bli_obj_is_row_tilted( *c ) ) her2_cntl = her2_cntl_ge_row;
|
||||
else her2_cntl = her2_cntl_ge_col;
|
||||
if ( bli_obj_is_lower( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lrow_ucol;
|
||||
else her2_cntl = her2_cntl_ge_lcol_urow;
|
||||
}
|
||||
else // if ( bli_obj_is_upper( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lcol_urow;
|
||||
else her2_cntl = her2_cntl_ge_lrow_ucol;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -43,11 +43,11 @@ extern ger_t* ger_cntl_cp_bs_col;
|
||||
|
||||
static blksz_t* her2_mc;
|
||||
|
||||
her2_t* her2_cntl_bs_ke_row;
|
||||
her2_t* her2_cntl_bs_ke_col;
|
||||
her2_t* her2_cntl_bs_ke_lrow_ucol;
|
||||
her2_t* her2_cntl_bs_ke_lcol_urow;
|
||||
|
||||
her2_t* her2_cntl_ge_row;
|
||||
her2_t* her2_cntl_ge_col;
|
||||
her2_t* her2_cntl_ge_lrow_ucol;
|
||||
her2_t* her2_cntl_ge_lcol_urow;
|
||||
|
||||
// Cache blocksizes.
|
||||
|
||||
@@ -69,14 +69,14 @@ void bli_her2_cntl_init()
|
||||
|
||||
// Create control trees for the lowest-level kernels. These trees induce
|
||||
// operations on (presumably) relatively small block-subvector problems.
|
||||
her2_cntl_bs_ke_row
|
||||
her2_cntl_bs_ke_lrow_ucol
|
||||
=
|
||||
bli_her2_cntl_obj_create( BLIS_UNB_FUSED,
|
||||
BLIS_VARIANT1,
|
||||
NULL, NULL, NULL,
|
||||
NULL, NULL, NULL,
|
||||
NULL, NULL );
|
||||
her2_cntl_bs_ke_col
|
||||
her2_cntl_bs_ke_lcol_urow
|
||||
=
|
||||
bli_her2_cntl_obj_create( BLIS_UNB_FUSED,
|
||||
BLIS_VARIANT4,
|
||||
@@ -88,38 +88,38 @@ void bli_her2_cntl_init()
|
||||
// Create control trees for generally large problems. Here, we choose
|
||||
// variants that partition for ger subproblems in the same direction
|
||||
// as the assumed storage.
|
||||
her2_cntl_ge_row
|
||||
her2_cntl_ge_lrow_ucol
|
||||
=
|
||||
bli_her2_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1, // use var1 for row storage
|
||||
BLIS_VARIANT1,
|
||||
her2_mc,
|
||||
packv_cntl, // pack x1 (if needed)
|
||||
packv_cntl, // pack y1 (if needed)
|
||||
packm_cntl_noscale, // pack C11 (if needed)
|
||||
ger_cntl_rp_bs_row,
|
||||
ger_cntl_rp_bs_row,
|
||||
her2_cntl_bs_ke_row,
|
||||
her2_cntl_bs_ke_lrow_ucol,
|
||||
unpackm_cntl ); // unpack C11 (if packed)
|
||||
her2_cntl_ge_col
|
||||
her2_cntl_ge_lcol_urow
|
||||
=
|
||||
bli_her2_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4, // use var4 for col storage
|
||||
BLIS_VARIANT4,
|
||||
her2_mc,
|
||||
packv_cntl, // pack x1 (if needed)
|
||||
packv_cntl, // pack y1 (if needed)
|
||||
packm_cntl_noscale, // pack C11 (if needed)
|
||||
ger_cntl_cp_bs_col,
|
||||
ger_cntl_cp_bs_col,
|
||||
her2_cntl_bs_ke_col,
|
||||
her2_cntl_bs_ke_lcol_urow,
|
||||
unpackm_cntl ); // unpack C11 (if packed)
|
||||
}
|
||||
|
||||
void bli_her2_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( her2_cntl_bs_ke_row );
|
||||
bli_cntl_obj_free( her2_cntl_bs_ke_col );
|
||||
bli_cntl_obj_free( her2_cntl_ge_row );
|
||||
bli_cntl_obj_free( her2_cntl_ge_col );
|
||||
bli_cntl_obj_free( her2_cntl_bs_ke_lrow_ucol );
|
||||
bli_cntl_obj_free( her2_cntl_bs_ke_lcol_urow );
|
||||
bli_cntl_obj_free( her2_cntl_ge_lrow_ucol );
|
||||
bli_cntl_obj_free( her2_cntl_ge_lcol_urow );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -74,10 +74,10 @@ void bli_her2_int( conj_t conjh,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_her2_int_check( conjh, alpha, x, y, c, cntl );
|
||||
|
||||
// Return early if one of the operands has a zero dimension.
|
||||
// If C, x, or y has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *x ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *y ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
|
||||
// Alias the operands in case we need to apply conjugations.
|
||||
bli_obj_alias_to( *x, x_local );
|
||||
|
||||
@@ -34,10 +34,10 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern her_t* her_cntl_bs_ke_row;
|
||||
extern her_t* her_cntl_bs_ke_col;
|
||||
extern her_t* her_cntl_ge_row;
|
||||
extern her_t* her_cntl_ge_col;
|
||||
extern her_t* her_cntl_bs_ke_lrow_ucol;
|
||||
extern her_t* her_cntl_bs_ke_lcol_urow;
|
||||
extern her_t* her_cntl_ge_lrow_ucol;
|
||||
extern her_t* her_cntl_ge_lcol_urow;
|
||||
|
||||
void bli_syr( obj_t* alpha,
|
||||
obj_t* x,
|
||||
@@ -81,10 +81,20 @@ void bli_syr( obj_t* alpha,
|
||||
if ( x_is_contig &&
|
||||
c_is_contig )
|
||||
{
|
||||
// Use different control trees depending on storage of the matrix
|
||||
// operand.
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_row;
|
||||
else her_cntl = her_cntl_bs_ke_col;
|
||||
// We use two control trees to handle the four cases corresponding to
|
||||
// combinations of upper/lower triangular storage and row/column-storage.
|
||||
// The row-stored lower triangular and column-stored upper triangular
|
||||
// trees are identical. Same for the remaining two trees.
|
||||
if ( bli_obj_is_lower( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol;
|
||||
else her_cntl = her_cntl_bs_ke_lcol_urow;
|
||||
}
|
||||
else // if ( bli_obj_is_upper( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lcol_urow;
|
||||
else her_cntl = her_cntl_bs_ke_lrow_ucol;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -95,8 +105,16 @@ void bli_syr( obj_t* alpha,
|
||||
|
||||
// Here, we make a similar choice as above, except that (1) we look
|
||||
// at storage tilt, and (2) we choose a tree that performs blocking.
|
||||
if ( bli_obj_is_row_tilted( *c ) ) her_cntl = her_cntl_ge_row;
|
||||
else her_cntl = her_cntl_ge_col;
|
||||
if ( bli_obj_is_lower( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lrow_ucol;
|
||||
else her_cntl = her_cntl_ge_lcol_urow;
|
||||
}
|
||||
else // if ( bli_obj_is_upper( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lcol_urow;
|
||||
else her_cntl = her_cntl_ge_lrow_ucol;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -34,10 +34,10 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern her2_t* her2_cntl_bs_ke_row;
|
||||
extern her2_t* her2_cntl_bs_ke_col;
|
||||
extern her2_t* her2_cntl_ge_row;
|
||||
extern her2_t* her2_cntl_ge_col;
|
||||
extern her2_t* her2_cntl_bs_ke_lrow_ucol;
|
||||
extern her2_t* her2_cntl_bs_ke_lcol_urow;
|
||||
extern her2_t* her2_cntl_ge_lrow_ucol;
|
||||
extern her2_t* her2_cntl_ge_lcol_urow;
|
||||
|
||||
void bli_syr2( obj_t* alpha,
|
||||
obj_t* x,
|
||||
@@ -86,10 +86,20 @@ void bli_syr2( obj_t* alpha,
|
||||
y_is_contig &&
|
||||
c_is_contig )
|
||||
{
|
||||
// Use different control trees depending on storage of the matrix
|
||||
// operand.
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_row;
|
||||
else her2_cntl = her2_cntl_bs_ke_col;
|
||||
// We use two control trees to handle the four cases corresponding to
|
||||
// combinations of upper/lower triangular storage and row/column-storage.
|
||||
// The row-stored lower triangular and column-stored upper triangular
|
||||
// trees are identical. Same for the remaining two trees.
|
||||
if ( bli_obj_is_lower( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol;
|
||||
else her2_cntl = her2_cntl_bs_ke_lcol_urow;
|
||||
}
|
||||
else // if ( bli_obj_is_upper( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow;
|
||||
else her2_cntl = her2_cntl_bs_ke_lrow_ucol;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -101,8 +111,16 @@ void bli_syr2( obj_t* alpha,
|
||||
|
||||
// Here, we make a similar choice as above, except that (1) we look
|
||||
// at storage tilt, and (2) we choose a tree that performs blocking.
|
||||
if ( bli_obj_is_row_tilted( *c ) ) her2_cntl = her2_cntl_ge_row;
|
||||
else her2_cntl = her2_cntl_ge_col;
|
||||
if ( bli_obj_is_lower( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lrow_ucol;
|
||||
else her2_cntl = her2_cntl_ge_lcol_urow;
|
||||
}
|
||||
else // if ( bli_obj_is_upper( *c ) )
|
||||
{
|
||||
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lcol_urow;
|
||||
else her2_cntl = her2_cntl_ge_lrow_ucol;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -74,8 +74,7 @@ void bli_trmv_int( obj_t* alpha,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_trmv_int_check( alpha, a, x, cntl );
|
||||
|
||||
// Return early if one of the operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
// If x has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *x ) ) return;
|
||||
|
||||
// Alias A in case we need to induce a transformation (ie: transposition).
|
||||
|
||||
@@ -168,7 +168,7 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_min( m - iter, b_fuse ); \
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
|
||||
i = iter; \
|
||||
n_ahead = m - iter - f; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
@@ -223,12 +223,12 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_min( m - iter, b_fuse ); \
|
||||
i = m - iter; \
|
||||
n_ahead = i - f; \
|
||||
A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \
|
||||
A10 = a_cast + (i-f)*rs_at + (0 )*cs_at; \
|
||||
x1 = x_cast + (i-f)*incx; \
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
|
||||
i = m - iter - f; \
|
||||
n_ahead = i; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
A10 = a_cast + (i )*rs_at + (0 )*cs_at; \
|
||||
x1 = x_cast + (i )*incx; \
|
||||
x0 = x_cast + (0 )*incx; \
|
||||
\
|
||||
/* x1 = alpha * A11 * x1; */ \
|
||||
|
||||
@@ -167,7 +167,7 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_min( m - iter, b_fuse ); \
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
|
||||
i = iter; \
|
||||
n_behind = i; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
@@ -220,13 +220,13 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_min( m - iter, b_fuse ); \
|
||||
i = m - iter; \
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
|
||||
i = m - iter - f; \
|
||||
n_behind = iter; \
|
||||
A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \
|
||||
A21 = a_cast + (i )*rs_at + (i-f)*cs_at; \
|
||||
x1 = x_cast + (i-f)*incx; \
|
||||
x2 = x_cast + (i )*incx; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
A21 = a_cast + (i+f)*rs_at + (i )*cs_at; \
|
||||
x1 = x_cast + (i )*incx; \
|
||||
x2 = x_cast + (i+f)*incx; \
|
||||
\
|
||||
/* x2 = x2 + alpha * A21 * x1; */ \
|
||||
PASTEMAC3(cha,chx,chx,kername)( conja, \
|
||||
|
||||
@@ -74,8 +74,7 @@ void bli_trsv_int( obj_t* alpha,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_trsv_int_check( alpha, a, x, cntl );
|
||||
|
||||
// Return early if one of the operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
// If x has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *x ) ) return;
|
||||
|
||||
// Alias A in case we need to induce a transformation (ie: transposition).
|
||||
|
||||
@@ -175,13 +175,13 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_min( m - iter, b_fuse ); \
|
||||
i = m - iter; \
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
|
||||
i = m - iter - f; \
|
||||
n_behind = iter; \
|
||||
A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \
|
||||
A12 = a_cast + (i-f)*rs_at + (i )*cs_at; \
|
||||
x1 = x_cast + (i-f)*incx; \
|
||||
x2 = x_cast + (i )*incx; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
A12 = a_cast + (i )*rs_at + (i+f)*cs_at; \
|
||||
x1 = x_cast + (i )*incx; \
|
||||
x2 = x_cast + (i+f)*incx; \
|
||||
\
|
||||
/* x1 = x1 - A12 * x2; */ \
|
||||
PASTEMAC3(cha,chx,chx,kername)( conja, \
|
||||
@@ -231,7 +231,7 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_min( m - iter, b_fuse ); \
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
|
||||
i = iter; \
|
||||
n_behind = i; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
|
||||
@@ -174,12 +174,12 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_min( m - iter, b_fuse ); \
|
||||
i = m - iter; \
|
||||
n_ahead = i - f; \
|
||||
A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \
|
||||
A01 = a_cast + (0 )*rs_at + (i-f)*cs_at; \
|
||||
x1 = x_cast + (i-f)*incx; \
|
||||
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
|
||||
i = m - iter - f; \
|
||||
n_ahead = i; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
A01 = a_cast + (0 )*rs_at + (i )*cs_at; \
|
||||
x1 = x_cast + (i )*incx; \
|
||||
x0 = x_cast + (0 )*incx; \
|
||||
\
|
||||
/* x1 = x1 / triu( A11 ); */ \
|
||||
@@ -228,7 +228,7 @@ void PASTEMAC2(cha,chx,varname)( \
|
||||
{ \
|
||||
for ( iter = 0; iter < m; iter += f ) \
|
||||
{ \
|
||||
f = bli_min( m - iter, b_fuse ); \
|
||||
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
|
||||
i = iter; \
|
||||
n_ahead = m - iter - f; \
|
||||
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
|
||||
|
||||
@@ -70,11 +70,17 @@ void bli_gemm_int( obj_t* alpha,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_int_check( alpha, a, b, beta, c, cntl );
|
||||
|
||||
// Return early if one of the matrix operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *b ) ) return;
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
|
||||
// If A or B has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias C in case we need to induce a transposition.
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
|
||||
|
||||
@@ -88,12 +88,18 @@ void bli_her2k_int( obj_t* alpha,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_her2k_int_check( alpha, a, bh, alpha_conj, b, ah, beta, c, cntl );
|
||||
|
||||
// Return early if one of the matrix operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *bh ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *b ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *ah ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
|
||||
// If A or B has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *ah ) ||
|
||||
bli_obj_has_zero_dim( *b ) ||
|
||||
bli_obj_has_zero_dim( *bh ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias C in case we need to induce a transposition.
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
|
||||
@@ -82,10 +82,16 @@ void bli_herk_int( obj_t* alpha,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_herk_int_check( alpha, a, ah, beta, c, cntl );
|
||||
|
||||
// Return early if one of the matrix operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *ah ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
|
||||
// If A or A' has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *ah ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias C in case we need to induce a transposition.
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
|
||||
@@ -83,11 +83,17 @@ void bli_trmm_int( side_t side,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_trmm_int_check( side, alpha, a, b, beta, c, cntl );
|
||||
|
||||
// Return early if one of the matrix operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *b ) ) return;
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
|
||||
// If A or B has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias C in case we need to induce a transposition.
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
|
||||
|
||||
@@ -83,11 +83,17 @@ void bli_trsm_int( side_t side,
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_trsm_int_check( side, alpha, a, b, beta, c, cntl );
|
||||
|
||||
// Return early if one of the matrix operands has a zero dimension.
|
||||
if ( bli_obj_has_zero_dim( *a ) ) return;
|
||||
if ( bli_obj_has_zero_dim( *b ) ) return;
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( *c ) ) return;
|
||||
|
||||
// If A or B has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias C in case we need to induce a transposition.
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
|
||||
|
||||
@@ -485,8 +485,10 @@ bli_obj_width_stored( obj )
|
||||
|
||||
// Evaluates to the element increment to use when traversing vector object x.
// - If x is 1 x 1 (per bli_obj_is_scalar), the increment is 1, regardless of
//   the strides stored in the object.
// - Otherwise, if the length of x is 1, the column stride is used.
// - Otherwise, the row stride is used.
// Note: x is expanded more than once.
#define bli_obj_vector_inc( x ) \
\
	( bli_obj_is_scalar( x ) ? 1 : \
	  ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) \
	                             : bli_obj_row_stride( x ) ) \
	)
|
||||
|
||||
#define bli_obj_is_vector( x ) \
|
||||
\
|
||||
@@ -506,6 +508,11 @@ bli_obj_width_stored( obj )
|
||||
( bli_obj_length( obj ) == 0 || \
|
||||
bli_obj_width( obj ) == 0 )
|
||||
|
||||
// Evaluates to nonzero when both the length and the width of x equal 1
// (i.e., x is a 1 x 1 object), and to zero otherwise.
#define bli_obj_is_scalar( x ) \
	( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 )
|
||||
|
||||
|
||||
// Dimension modification
|
||||
|
||||
|
||||
@@ -314,6 +314,19 @@
|
||||
else { mt = n; nt = m; rst = cs; cst = rs; } \
|
||||
}
|
||||
|
||||
|
||||
// blocksize-related
|
||||
|
||||
// Forward-iteration blocksize: evaluates to the smaller of the algorithmic
// blocksize b_alg and the number of elements remaining, (dim - i), where i is
// the current offset into a dimension of size dim.
// Every argument is parenthesized so that expression arguments (e.g. an
// offset passed as "a + b") are grouped as the caller intended.
// Note: arguments may be evaluated more than once, depending on bli_min.
#define bli_determine_blocksize_dim_f( i, dim, b_alg ) \
\
	( bli_min( (b_alg), (dim) - (i) ) )
|
||||
|
||||
// Backward-iteration blocksize: on the first iteration (i == 0), if dim is
// not a multiple of b_alg, evaluates to the remainder (dim % b_alg);
// in every other case it evaluates to b_alg. Taking the partial block first
// means the remaining iterations each cover exactly b_alg elements.
// Every argument is parenthesized so that expression arguments are grouped
// as the caller intended.
// Precondition: b_alg must be nonzero (it is used as a modulus).
// Note: arguments are evaluated more than once.
#define bli_determine_blocksize_dim_b( i, dim, b_alg ) \
\
	( (i) == 0 && (dim) % (b_alg) != 0 ? (dim) % (b_alg) \
	                                   : (b_alg) )
|
||||
|
||||
|
||||
// stride-related
|
||||
|
||||
#define bli_vector_inc( trans, m, n, rs, cs ) \
|
||||
|
||||
Reference in New Issue
Block a user