Fixed x86_64 kernel bugs and other minor issues.

Details:
- Fixed bugs in trmv_l and trsv_u due to backwards iteration resulting in
  unaligned subpartitions. We were already going out of our way a bit to
  handle edge cases in the first iteration for blocked variants, and this
  fix simply extends that idea to the unblocked-fused variants.
- Fixed control tree handling in her/her2/syr/syr2 that was not taking
  into account how the choice of variant needed to be altered for
  upper-stored matrices (given that only lower-stored algorithms are
  explicitly implemented).
- Added bli_determine_blocksize_dim_f(), bli_determine_blocksize_dim_b()
  macros to provide inlined versions of bli_determine_blocksize_[fb]() for
  use by unblocked-fused variants.
- Integrated new blocksize_dim macros into gemv/hemv unf variants for
  consistency with the trmv/trsv bugfix (trmv and trsv both now use the
  same macros).
- Modified bli_obj_vector_inc() so that 1 is returned if the object is a
  vector of length 1 (i.e., 1 x 1). This fixes a bug whereby, under certain
  conditions (e.g. dotv_opt_var1), the returned increment was greater than 1
  even though the code expected 1 (for purposes of performing contiguous
  vector loads). This happened because the column stride of the object
  (e.g. rho) was inflated for alignment purposes (albeit unnecessarily,
  since there is only one element in the object).
- Replaced some old invocations of set0 with set0s.
- Added alpha parameter to gemmtrsm ukernels for x86_64 and use accordingly.
- Fixed increment bug in cleanup loop of gemm ukernel for x86_64.
- Added safeguard to test modules so that testing a problem with a zero
  dimension does not result in a failure.
- Tweaked handling of zero dimensions in level-2 and level-3 operations'
  internal back-ends to correctly handle cases where the output operand
  still needs to be scaled (e.g. by beta, in the case of gemm with k = 0).
This commit is contained in:
Field G. Van Zee
2013-05-24 16:28:10 -05:00
parent d57ec42b34
commit 2d9c667f3c
82 changed files with 480 additions and 175 deletions

View File

@@ -75,11 +75,17 @@ void bli_gemv_int( trans_t transa,
if ( bli_error_checking_is_enabled() )
bli_gemv_int_check( alpha, &a_local, &x_local, beta, y, cntl );
// Return early if one of the operands has a zero dimension.
if ( bli_obj_has_zero_dim( *a ) ) return;
if ( bli_obj_has_zero_dim( *x ) ) return;
// If y has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *y ) ) return;
// If A or x has a zero dimension, scale y by beta and return early.
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *x ) )
{
bli_scalm( beta, y );
return;
}
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );

View File

@@ -164,7 +164,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
\
for ( i = 0; i < n_iter; i += f ) \
{ \
f = bli_min( n_iter - i, b_fuse ); \
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
\
A1 = a_cast + (i )*rs_at + (0 )*cs_at; \
x1 = x_cast + (0 )*incy; \

View File

@@ -182,7 +182,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
\
for ( i = 0; i < n_iter; i += f ) \
{ \
f = bli_min( n_iter - i, b_fuse ); \
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
\
A1 = a_cast + (0 )*rs_at + (i )*cs_at; \
x1 = x_cast + (i )*incx; \

View File

@@ -71,11 +71,16 @@ void bli_ger_int( conj_t conjx,
if ( bli_error_checking_is_enabled() )
bli_ger_int_check( alpha, x, y, a, cntl );
// Return early if one of the operands has a zero dimension.
if ( bli_obj_has_zero_dim( *x ) ) return;
if ( bli_obj_has_zero_dim( *y ) ) return;
// If A has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *a ) ) return;
// If x or y has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *x ) ||
bli_obj_has_zero_dim( *y ) )
{
return;
}
// Alias the objects, applying conjx and conjy to x and y, respectively.
bli_obj_alias_with_conj( conjx, *x, x_local );
bli_obj_alias_with_conj( conjy, *y, y_local );

View File

@@ -70,11 +70,17 @@ void bli_hemv_int( conj_t conjh,
if ( bli_error_checking_is_enabled() )
bli_hemv_int_check( conjh, alpha, a, x, beta, y, cntl );
// Return early if one of the operands has a zero dimension.
if ( bli_obj_has_zero_dim( *a ) ) return;
if ( bli_obj_has_zero_dim( *x ) ) return;
// If y has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *y ) ) return;
// If A or x has a zero dimension, scale y by beta and return early.
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *x ) )
{
bli_scalm( beta, y );
return;
}
// Alias A in case we need to induce the upper triangular case.
bli_obj_alias_to( *a, a_local );

View File

@@ -215,7 +215,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
\
for ( i = 0; i < m; i += f ) \
{ \
f = bli_min( m - i, b_fuse ); \
f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
n_behind = i; \
A10 = a_cast + (i )*rs_at + (0 )*cs_at; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \

View File

@@ -233,7 +233,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
\
for ( i = 0; i < m; i += f ) \
{ \
f = bli_min( m - i, b_fuse ); \
f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
n_ahead = m - i - f; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
A21 = a_cast + (i+f)*rs_at + (i )*cs_at; \

View File

@@ -34,10 +34,10 @@
#include "blis.h"
extern her_t* her_cntl_bs_ke_row;
extern her_t* her_cntl_bs_ke_col;
extern her_t* her_cntl_ge_row;
extern her_t* her_cntl_ge_col;
extern her_t* her_cntl_bs_ke_lrow_ucol;
extern her_t* her_cntl_bs_ke_lcol_urow;
extern her_t* her_cntl_ge_lrow_ucol;
extern her_t* her_cntl_ge_lcol_urow;
void bli_her( obj_t* alpha,
obj_t* x,
@@ -79,10 +79,20 @@ void bli_her( obj_t* alpha,
if ( x_is_contig &&
c_is_contig )
{
// Use different control trees depending on storage of the matrix
// operand.
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_row;
else her_cntl = her_cntl_bs_ke_col;
// We use two control trees to handle the four cases corresponding to
// combinations of upper/lower triangular storage and row/column-storage.
// The row-stored lower triangular and column-stored upper triangular
// trees are identical. Same for the remaining two trees.
if ( bli_obj_is_lower( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol;
else her_cntl = her_cntl_bs_ke_lcol_urow;
}
else // if ( bli_obj_is_upper( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lcol_urow;
else her_cntl = her_cntl_bs_ke_lrow_ucol;
}
}
else
{
@@ -93,8 +103,16 @@ void bli_her( obj_t* alpha,
// Here, we make a similar choice as above, except that (1) we look
// at storage tilt, and (2) we choose a tree that performs blocking.
if ( bli_obj_is_row_tilted( *c ) ) her_cntl = her_cntl_ge_row;
else her_cntl = her_cntl_ge_col;
if ( bli_obj_is_lower( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lrow_ucol;
else her_cntl = her_cntl_ge_lcol_urow;
}
else // if ( bli_obj_is_upper( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lcol_urow;
else her_cntl = her_cntl_ge_lrow_ucol;
}
}

View File

@@ -45,11 +45,11 @@ extern ger_t* ger_cntl_bs_ke_col;
static blksz_t* her_mc;
her_t* her_cntl_bs_ke_row;
her_t* her_cntl_bs_ke_col;
her_t* her_cntl_bs_ke_lrow_ucol;
her_t* her_cntl_bs_ke_lcol_urow;
her_t* her_cntl_ge_row;
her_t* her_cntl_ge_col;
her_t* her_cntl_ge_lrow_ucol;
her_t* her_cntl_ge_lcol_urow;
// Cache blocksizes.
@@ -71,13 +71,13 @@ void bli_her_cntl_init()
// Create control trees for the lowest-level kernels. These trees induce
// operations on (presumably) relatively small block-subvector problems.
her_cntl_bs_ke_row
her_cntl_bs_ke_lrow_ucol
=
bli_her_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1,
NULL, NULL, NULL,
NULL, NULL, NULL );
her_cntl_bs_ke_col
her_cntl_bs_ke_lcol_urow
=
bli_her_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT2,
@@ -88,34 +88,34 @@ void bli_her_cntl_init()
// Create control trees for generally large problems. Here, we choose
// variants that partition for ger subproblems in the same direction
// as the assumed storage.
her_cntl_ge_row
her_cntl_ge_lrow_ucol
=
bli_her_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT1, // use var1 for row storage
BLIS_VARIANT1,
her_mc,
packv_cntl, // pack x1 (if needed)
NULL, // do NOT pack C11
ger_cntl_rp_bs_row,
her_cntl_bs_ke_row,
her_cntl_bs_ke_lrow_ucol,
NULL ); // no unpacking needed
her_cntl_ge_col
her_cntl_ge_lcol_urow
=
bli_her_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2, // use var2 for col storage
BLIS_VARIANT2,
her_mc,
packv_cntl, // pack x1 (if needed)
NULL, // do NOT pack C11
ger_cntl_cp_bs_col,
her_cntl_bs_ke_col,
her_cntl_bs_ke_lcol_urow,
NULL ); // no unpacking needed
}
void bli_her_cntl_finalize()
{
bli_cntl_obj_free( her_cntl_bs_ke_row );
bli_cntl_obj_free( her_cntl_bs_ke_col );
bli_cntl_obj_free( her_cntl_ge_row );
bli_cntl_obj_free( her_cntl_ge_col );
bli_cntl_obj_free( her_cntl_bs_ke_lrow_ucol );
bli_cntl_obj_free( her_cntl_bs_ke_lcol_urow );
bli_cntl_obj_free( her_cntl_ge_lrow_ucol );
bli_cntl_obj_free( her_cntl_ge_lcol_urow );
}

View File

@@ -67,9 +67,9 @@ void bli_her_int( conj_t conjh,
if ( bli_error_checking_is_enabled() )
bli_her_int_check( conjh, alpha, x, c, cntl );
// Return early if one of the operands has a zero dimension.
if ( bli_obj_has_zero_dim( *x ) ) return;
// If C or x has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *c ) ) return;
if ( bli_obj_has_zero_dim( *x ) ) return;
// Alias the operands in case we need to apply conjugations.
bli_obj_alias_to( *x, x_local );

View File

@@ -34,10 +34,10 @@
#include "blis.h"
extern her2_t* her2_cntl_bs_ke_row;
extern her2_t* her2_cntl_bs_ke_col;
extern her2_t* her2_cntl_ge_row;
extern her2_t* her2_cntl_ge_col;
extern her2_t* her2_cntl_bs_ke_lrow_ucol;
extern her2_t* her2_cntl_bs_ke_lcol_urow;
extern her2_t* her2_cntl_ge_lrow_ucol;
extern her2_t* her2_cntl_ge_lcol_urow;
void bli_her2( obj_t* alpha,
obj_t* x,
@@ -93,10 +93,20 @@ void bli_her2( obj_t* alpha,
y_is_contig &&
c_is_contig )
{
// Use different control trees depending on storage of the matrix
// operand.
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_row;
else her2_cntl = her2_cntl_bs_ke_col;
// We use two control trees to handle the four cases corresponding to
// combinations of upper/lower triangular storage and row/column-storage.
// The row-stored lower triangular and column-stored upper triangular
// trees are identical. Same for the remaining two trees.
if ( bli_obj_is_lower( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol;
else her2_cntl = her2_cntl_bs_ke_lcol_urow;
}
else // if ( bli_obj_is_upper( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow;
else her2_cntl = her2_cntl_bs_ke_lrow_ucol;
}
}
else
{
@@ -108,8 +118,16 @@ void bli_her2( obj_t* alpha,
// Here, we make a similar choice as above, except that (1) we look
// at storage tilt, and (2) we choose a tree that performs blocking.
if ( bli_obj_is_row_tilted( *c ) ) her2_cntl = her2_cntl_ge_row;
else her2_cntl = her2_cntl_ge_col;
if ( bli_obj_is_lower( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lrow_ucol;
else her2_cntl = her2_cntl_ge_lcol_urow;
}
else // if ( bli_obj_is_upper( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lcol_urow;
else her2_cntl = her2_cntl_ge_lrow_ucol;
}
}

View File

@@ -43,11 +43,11 @@ extern ger_t* ger_cntl_cp_bs_col;
static blksz_t* her2_mc;
her2_t* her2_cntl_bs_ke_row;
her2_t* her2_cntl_bs_ke_col;
her2_t* her2_cntl_bs_ke_lrow_ucol;
her2_t* her2_cntl_bs_ke_lcol_urow;
her2_t* her2_cntl_ge_row;
her2_t* her2_cntl_ge_col;
her2_t* her2_cntl_ge_lrow_ucol;
her2_t* her2_cntl_ge_lcol_urow;
// Cache blocksizes.
@@ -69,14 +69,14 @@ void bli_her2_cntl_init()
// Create control trees for the lowest-level kernels. These trees induce
// operations on (presumably) relatively small block-subvector problems.
her2_cntl_bs_ke_row
her2_cntl_bs_ke_lrow_ucol
=
bli_her2_cntl_obj_create( BLIS_UNB_FUSED,
BLIS_VARIANT1,
NULL, NULL, NULL,
NULL, NULL, NULL,
NULL, NULL );
her2_cntl_bs_ke_col
her2_cntl_bs_ke_lcol_urow
=
bli_her2_cntl_obj_create( BLIS_UNB_FUSED,
BLIS_VARIANT4,
@@ -88,38 +88,38 @@ void bli_her2_cntl_init()
// Create control trees for generally large problems. Here, we choose
// variants that partition for ger subproblems in the same direction
// as the assumed storage.
her2_cntl_ge_row
her2_cntl_ge_lrow_ucol
=
bli_her2_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT1, // use var1 for row storage
BLIS_VARIANT1,
her2_mc,
packv_cntl, // pack x1 (if needed)
packv_cntl, // pack y1 (if needed)
packm_cntl_noscale, // pack C11 (if needed)
ger_cntl_rp_bs_row,
ger_cntl_rp_bs_row,
her2_cntl_bs_ke_row,
her2_cntl_bs_ke_lrow_ucol,
unpackm_cntl ); // unpack C11 (if packed)
her2_cntl_ge_col
her2_cntl_ge_lcol_urow
=
bli_her2_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4, // use var4 for col storage
BLIS_VARIANT4,
her2_mc,
packv_cntl, // pack x1 (if needed)
packv_cntl, // pack y1 (if needed)
packm_cntl_noscale, // pack C11 (if needed)
ger_cntl_cp_bs_col,
ger_cntl_cp_bs_col,
her2_cntl_bs_ke_col,
her2_cntl_bs_ke_lcol_urow,
unpackm_cntl ); // unpack C11 (if packed)
}
void bli_her2_cntl_finalize()
{
bli_cntl_obj_free( her2_cntl_bs_ke_row );
bli_cntl_obj_free( her2_cntl_bs_ke_col );
bli_cntl_obj_free( her2_cntl_ge_row );
bli_cntl_obj_free( her2_cntl_ge_col );
bli_cntl_obj_free( her2_cntl_bs_ke_lrow_ucol );
bli_cntl_obj_free( her2_cntl_bs_ke_lcol_urow );
bli_cntl_obj_free( her2_cntl_ge_lrow_ucol );
bli_cntl_obj_free( her2_cntl_ge_lcol_urow );
}

View File

@@ -74,10 +74,10 @@ void bli_her2_int( conj_t conjh,
if ( bli_error_checking_is_enabled() )
bli_her2_int_check( conjh, alpha, x, y, c, cntl );
// Return early if one of the operands has a zero dimension.
// If C, x, or y has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *c ) ) return;
if ( bli_obj_has_zero_dim( *x ) ) return;
if ( bli_obj_has_zero_dim( *y ) ) return;
if ( bli_obj_has_zero_dim( *c ) ) return;
// Alias the operands in case we need to apply conjugations.
bli_obj_alias_to( *x, x_local );

View File

@@ -34,10 +34,10 @@
#include "blis.h"
extern her_t* her_cntl_bs_ke_row;
extern her_t* her_cntl_bs_ke_col;
extern her_t* her_cntl_ge_row;
extern her_t* her_cntl_ge_col;
extern her_t* her_cntl_bs_ke_lrow_ucol;
extern her_t* her_cntl_bs_ke_lcol_urow;
extern her_t* her_cntl_ge_lrow_ucol;
extern her_t* her_cntl_ge_lcol_urow;
void bli_syr( obj_t* alpha,
obj_t* x,
@@ -81,10 +81,20 @@ void bli_syr( obj_t* alpha,
if ( x_is_contig &&
c_is_contig )
{
// Use different control trees depending on storage of the matrix
// operand.
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_row;
else her_cntl = her_cntl_bs_ke_col;
// We use two control trees to handle the four cases corresponding to
// combinations of upper/lower triangular storage and row/column-storage.
// The row-stored lower triangular and column-stored upper triangular
// trees are identical. Same for the remaining two trees.
if ( bli_obj_is_lower( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol;
else her_cntl = her_cntl_bs_ke_lcol_urow;
}
else // if ( bli_obj_is_upper( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_bs_ke_lcol_urow;
else her_cntl = her_cntl_bs_ke_lrow_ucol;
}
}
else
{
@@ -95,8 +105,16 @@ void bli_syr( obj_t* alpha,
// Here, we make a similar choice as above, except that (1) we look
// at storage tilt, and (2) we choose a tree that performs blocking.
if ( bli_obj_is_row_tilted( *c ) ) her_cntl = her_cntl_ge_row;
else her_cntl = her_cntl_ge_col;
if ( bli_obj_is_lower( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lrow_ucol;
else her_cntl = her_cntl_ge_lcol_urow;
}
else // if ( bli_obj_is_upper( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her_cntl = her_cntl_ge_lcol_urow;
else her_cntl = her_cntl_ge_lrow_ucol;
}
}

View File

@@ -34,10 +34,10 @@
#include "blis.h"
extern her2_t* her2_cntl_bs_ke_row;
extern her2_t* her2_cntl_bs_ke_col;
extern her2_t* her2_cntl_ge_row;
extern her2_t* her2_cntl_ge_col;
extern her2_t* her2_cntl_bs_ke_lrow_ucol;
extern her2_t* her2_cntl_bs_ke_lcol_urow;
extern her2_t* her2_cntl_ge_lrow_ucol;
extern her2_t* her2_cntl_ge_lcol_urow;
void bli_syr2( obj_t* alpha,
obj_t* x,
@@ -86,10 +86,20 @@ void bli_syr2( obj_t* alpha,
y_is_contig &&
c_is_contig )
{
// Use different control trees depending on storage of the matrix
// operand.
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_row;
else her2_cntl = her2_cntl_bs_ke_col;
// We use two control trees to handle the four cases corresponding to
// combinations of upper/lower triangular storage and row/column-storage.
// The row-stored lower triangular and column-stored upper triangular
// trees are identical. Same for the remaining two trees.
if ( bli_obj_is_lower( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol;
else her2_cntl = her2_cntl_bs_ke_lcol_urow;
}
else // if ( bli_obj_is_upper( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow;
else her2_cntl = her2_cntl_bs_ke_lrow_ucol;
}
}
else
{
@@ -101,8 +111,16 @@ void bli_syr2( obj_t* alpha,
// Here, we make a similar choice as above, except that (1) we look
// at storage tilt, and (2) we choose a tree that performs blocking.
if ( bli_obj_is_row_tilted( *c ) ) her2_cntl = her2_cntl_ge_row;
else her2_cntl = her2_cntl_ge_col;
if ( bli_obj_is_lower( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lrow_ucol;
else her2_cntl = her2_cntl_ge_lcol_urow;
}
else // if ( bli_obj_is_upper( *c ) )
{
if ( bli_obj_is_row_stored( *c ) ) her2_cntl = her2_cntl_ge_lcol_urow;
else her2_cntl = her2_cntl_ge_lrow_ucol;
}
}

View File

@@ -74,8 +74,7 @@ void bli_trmv_int( obj_t* alpha,
if ( bli_error_checking_is_enabled() )
bli_trmv_int_check( alpha, a, x, cntl );
// Return early if one of the operands has a zero dimension.
if ( bli_obj_has_zero_dim( *a ) ) return;
// If x has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *x ) ) return;
// Alias A in case we need to induce a transformation (ie: transposition).

View File

@@ -168,7 +168,7 @@ void PASTEMAC2(cha,chx,varname)( \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_min( m - iter, b_fuse ); \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_ahead = m - iter - f; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
@@ -223,12 +223,12 @@ void PASTEMAC2(cha,chx,varname)( \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_min( m - iter, b_fuse ); \
i = m - iter; \
n_ahead = i - f; \
A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \
A10 = a_cast + (i-f)*rs_at + (0 )*cs_at; \
x1 = x_cast + (i-f)*incx; \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_ahead = i; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
A10 = a_cast + (i )*rs_at + (0 )*cs_at; \
x1 = x_cast + (i )*incx; \
x0 = x_cast + (0 )*incx; \
\
/* x1 = alpha * A11 * x1; */ \

View File

@@ -167,7 +167,7 @@ void PASTEMAC2(cha,chx,varname)( \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_min( m - iter, b_fuse ); \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_behind = i; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
@@ -220,13 +220,13 @@ void PASTEMAC2(cha,chx,varname)( \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_min( m - iter, b_fuse ); \
i = m - iter; \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_behind = iter; \
A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \
A21 = a_cast + (i )*rs_at + (i-f)*cs_at; \
x1 = x_cast + (i-f)*incx; \
x2 = x_cast + (i )*incx; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
A21 = a_cast + (i+f)*rs_at + (i )*cs_at; \
x1 = x_cast + (i )*incx; \
x2 = x_cast + (i+f)*incx; \
\
/* x2 = x2 + alpha * A21 * x1; */ \
PASTEMAC3(cha,chx,chx,kername)( conja, \

View File

@@ -74,8 +74,7 @@ void bli_trsv_int( obj_t* alpha,
if ( bli_error_checking_is_enabled() )
bli_trsv_int_check( alpha, a, x, cntl );
// Return early if one of the operands has a zero dimension.
if ( bli_obj_has_zero_dim( *a ) ) return;
// If x has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *x ) ) return;
// Alias A in case we need to induce a transformation (ie: transposition).

View File

@@ -175,13 +175,13 @@ void PASTEMAC2(cha,chx,varname)( \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_min( m - iter, b_fuse ); \
i = m - iter; \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_behind = iter; \
A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \
A12 = a_cast + (i-f)*rs_at + (i )*cs_at; \
x1 = x_cast + (i-f)*incx; \
x2 = x_cast + (i )*incx; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
A12 = a_cast + (i )*rs_at + (i+f)*cs_at; \
x1 = x_cast + (i )*incx; \
x2 = x_cast + (i+f)*incx; \
\
/* x1 = x1 - A12 * x2; */ \
PASTEMAC3(cha,chx,chx,kername)( conja, \
@@ -231,7 +231,7 @@ void PASTEMAC2(cha,chx,varname)( \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_min( m - iter, b_fuse ); \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_behind = i; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \

View File

@@ -174,12 +174,12 @@ void PASTEMAC2(cha,chx,varname)( \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_min( m - iter, b_fuse ); \
i = m - iter; \
n_ahead = i - f; \
A11 = a_cast + (i-f)*rs_at + (i-f)*cs_at; \
A01 = a_cast + (0 )*rs_at + (i-f)*cs_at; \
x1 = x_cast + (i-f)*incx; \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_ahead = i; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \
A01 = a_cast + (0 )*rs_at + (i )*cs_at; \
x1 = x_cast + (i )*incx; \
x0 = x_cast + (0 )*incx; \
\
/* x1 = x1 / triu( A11 ); */ \
@@ -228,7 +228,7 @@ void PASTEMAC2(cha,chx,varname)( \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_min( m - iter, b_fuse ); \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_ahead = m - iter - f; \
A11 = a_cast + (i )*rs_at + (i )*cs_at; \

View File

@@ -70,11 +70,17 @@ void bli_gemm_int( obj_t* alpha,
if ( bli_error_checking_is_enabled() )
bli_gemm_int_check( alpha, a, b, beta, c, cntl );
// Return early if one of the matrix operands has a zero dimension.
if ( bli_obj_has_zero_dim( *a ) ) return;
if ( bli_obj_has_zero_dim( *b ) ) return;
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *c ) ) return;
// If A or B has a zero dimension, scale C by beta and return early.
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *b ) )
{
bli_scalm( beta, c );
return;
}
// Alias C in case we need to induce a transposition.
bli_obj_alias_to( *c, c_local );

View File

@@ -88,12 +88,18 @@ void bli_her2k_int( obj_t* alpha,
if ( bli_error_checking_is_enabled() )
bli_her2k_int_check( alpha, a, bh, alpha_conj, b, ah, beta, c, cntl );
// Return early if one of the matrix operands has a zero dimension.
if ( bli_obj_has_zero_dim( *a ) ) return;
if ( bli_obj_has_zero_dim( *bh ) ) return;
if ( bli_obj_has_zero_dim( *b ) ) return;
if ( bli_obj_has_zero_dim( *ah ) ) return;
if ( bli_obj_has_zero_dim( *c ) ) return;
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *c ) ) return;
// If A or B has a zero dimension, scale C by beta and return early.
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *ah ) ||
bli_obj_has_zero_dim( *b ) ||
bli_obj_has_zero_dim( *bh ) )
{
bli_scalm( beta, c );
return;
}
// Alias C in case we need to induce a transposition.
bli_obj_alias_to( *c, c_local );

View File

@@ -82,10 +82,16 @@ void bli_herk_int( obj_t* alpha,
if ( bli_error_checking_is_enabled() )
bli_herk_int_check( alpha, a, ah, beta, c, cntl );
// Return early if one of the matrix operands has a zero dimension.
if ( bli_obj_has_zero_dim( *a ) ) return;
if ( bli_obj_has_zero_dim( *ah ) ) return;
if ( bli_obj_has_zero_dim( *c ) ) return;
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *c ) ) return;
// If A or A' has a zero dimension, scale C by beta and return early.
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *ah ) )
{
bli_scalm( beta, c );
return;
}
// Alias C in case we need to induce a transposition.
bli_obj_alias_to( *c, c_local );

View File

@@ -83,11 +83,17 @@ void bli_trmm_int( side_t side,
if ( bli_error_checking_is_enabled() )
bli_trmm_int_check( side, alpha, a, b, beta, c, cntl );
// Return early if one of the matrix operands has a zero dimension.
if ( bli_obj_has_zero_dim( *a ) ) return;
if ( bli_obj_has_zero_dim( *b ) ) return;
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *c ) ) return;
// If A or B has a zero dimension, scale C by beta and return early.
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *b ) )
{
bli_scalm( beta, c );
return;
}
// Alias C in case we need to induce a transposition.
bli_obj_alias_to( *c, c_local );

View File

@@ -83,11 +83,17 @@ void bli_trsm_int( side_t side,
if ( bli_error_checking_is_enabled() )
bli_trsm_int_check( side, alpha, a, b, beta, c, cntl );
// Return early if one of the matrix operands has a zero dimension.
if ( bli_obj_has_zero_dim( *a ) ) return;
if ( bli_obj_has_zero_dim( *b ) ) return;
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *c ) ) return;
// If A or B has a zero dimension, scale C by beta and return early.
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *b ) )
{
bli_scalm( beta, c );
return;
}
// Alias C in case we need to induce a transposition.
bli_obj_alias_to( *c, c_local );

View File

@@ -485,8 +485,10 @@ bli_obj_width_stored( obj )
// Increment (stride) to use when stepping through the elements of vector
// object x. Evaluates to 1 when x is a scalar (1 x 1) object; otherwise to
// the column stride when the length of x is 1, and to the row stride in all
// other cases.
#define bli_obj_vector_inc( x ) \
\
( bli_obj_is_scalar( x ) ? 1 : \
( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) \
: bli_obj_row_stride( x ) )
: bli_obj_row_stride( x ) ) \
)
#define bli_obj_is_vector( x ) \
\
@@ -506,6 +508,11 @@ bli_obj_width_stored( obj )
( bli_obj_length( obj ) == 0 || \
bli_obj_width( obj ) == 0 )
// Evaluates to nonzero when object x is 1 x 1, that is, when both its
// length and its width are equal to 1.
#define bli_obj_is_scalar( x ) \
\
	( ( bli_obj_length( x ) == 1 ) && ( bli_obj_width( x ) == 1 ) )
// Dimension modification

View File

@@ -314,6 +314,19 @@
else { mt = n; nt = m; rst = cs; cst = rs; } \
}
// blocksize-related
// Blocksize for the iteration at offset i when a dimension of size dim is
// traversed in steps of at most b_alg: the smaller of b_alg and the
// remaining extent dim - i.
// The dim and i arguments are parenthesized so that expression arguments
// (e.g. an offset written as a + b) expand with the intended precedence.
#define bli_determine_blocksize_dim_f( i, dim, b_alg ) \
\
	( bli_min( (b_alg), (dim) - (i) ) )
// Blocksize for the iteration at offset i when a dimension of size dim is
// partitioned in steps of at most b_alg with the edge case handled first:
// the first iteration (i == 0) takes the remainder dim % b_alg when that
// remainder is nonzero, and every other iteration takes a full b_alg.
// All arguments are parenthesized so that expression arguments (e.g. a
// dimension written as m + n) expand with the intended precedence.
// b_alg must be nonzero, since it is used as a modulus.
#define bli_determine_blocksize_dim_b( i, dim, b_alg ) \
\
	( (i) == 0 && (dim) % (b_alg) != 0 ? (dim) % (b_alg) \
	                                   : (b_alg) )
// stride-related
#define bli_vector_inc( trans, m, n, rs, cs ) \