Various level-3 optimizations for row storage.

Details:
- Implemented remaining two cases within bli_packm_blk_var2(), which allow
  packing from a lower or upper-stored symmetric/Hermitian matrix to column
  panels (which are row-stored). Previously one could only pack to row panels
  (which are column-stored).
- Implemented various optimizations in the level-3 front-ends that allow more
  favorable access through row-stored matrices for gemm, hemm, herk, her2k,
  symm, syrk, and syr2k.
- Cleaned up code in level-3 front-ends that has to do with setting target and
  execution datatypes.
This commit is contained in:
Field G. Van Zee
2013-06-11 12:18:39 -05:00
parent 05a657a6b9
commit 08475e7c76
21 changed files with 871 additions and 408 deletions

View File

@@ -143,6 +143,7 @@ void PASTEMAC(ch,varname )( \
dim_t panel_len; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
doff_t diagoffc_i_abs; \
dim_t panel_dim_i; \
inc_t vs_c; \
inc_t incc, ldc; \
@@ -171,6 +172,7 @@ void PASTEMAC(ch,varname )( \
ctype* restrict p11; \
dim_t p11_m; \
dim_t p11_n; \
inc_t rs_p11, cs_p11; \
\
\
/* Extract the conjugation bit from the transposition argument. */ \
@@ -191,7 +193,7 @@ void PASTEMAC(ch,varname )( \
we are packing to row panels. */ \
if ( bli_is_row_stored( rs_p, cs_p ) ) \
{ \
/* Prepare to pack to column panels. */ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len = m; \
panel_dim = pd_p; \
@@ -204,10 +206,12 @@ void PASTEMAC(ch,varname )( \
n_panel = &panel_dim_i; \
m_panel_max = m_max; \
n_panel_max = panel_dim; \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored( rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to row panels. */ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len = n; \
panel_dim = pd_p; \
@@ -220,6 +224,8 @@ void PASTEMAC(ch,varname )( \
n_panel = &n; \
m_panel_max = panel_dim; \
n_panel_max = n_max; \
rs_p11 = 1; \
cs_p11 = cs_p; \
} \
\
/* Compute the total number of iterations we'll need. */ \
@@ -237,11 +243,11 @@ void PASTEMAC(ch,varname )( \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \
panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
p_begin = p_cast + (ip )*ps_p; \
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
p_begin = p_cast + (ip )*ps_p; \
\
/* If the current panel intersects the diagonal and C is either
upper- or lower-stored, then we assume C is symmetric or
@@ -252,17 +258,76 @@ void PASTEMAC(ch,varname )( \
if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
bli_is_upper_or_lower( uploc ) ) \
{ \
/* Only two of four cases implemented, since BLIS2 currently does
not support triangular packing of matrix B (which is row-stored). */ \
/*if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
diagoffc_i_abs = bli_abs( diagoffc_i ); \
\
/*if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) */ \
if ( ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
p10_dim = panel_dim_i; \
p10_len = diagoffc_i_abs; \
p10 = p_begin; \
c10 = c_begin; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim_i; \
p12_len = panel_len - p10_len; \
j = p10_len; \
diagoffc12 = diagoffc_i_abs - j; \
p12 = p_begin + (j )*ldp; \
c12 = c_begin + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
p11_m = panel_dim_i; \
p11_n = panel_dim_i; \
j = diagoffc_i_abs; \
p11 = p_begin + (j )*ldp; \
c11 = c_begin + (j )*ldc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) \
/*else if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \
else /* if ( ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
p10_dim = panel_dim_i; \
p10_len = diagoffc_i_abs + panel_dim_i; \
diagoffc10 = diagoffc_i; \
p10 = p_begin; \
c10 = c_begin; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim_i; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p_begin + (j )*ldp; \
c12 = c_begin + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
p11_m = panel_dim_i; \
p11_n = panel_dim_i; \
j = diagoffc_i_abs; \
p11 = p_begin + (j )*ldp; \
c11 = c_begin + (j )*ldc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc10 ); \
} \
else*/ if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
/*
else if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
{ \
p10_dim = panel_dim_i; \
p10_len = diagoffc_i + panel_dim_i; \
@@ -279,7 +344,7 @@ void PASTEMAC(ch,varname )( \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p_begin + (j )*ldp; \
c12 = c_begin + (j )*cs_c; \
c12 = c_begin + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
@@ -288,12 +353,12 @@ void PASTEMAC(ch,varname )( \
p11_n = panel_dim_i; \
j = diagoffc_i; \
p11 = p_begin + (j )*ldp; \
c11 = c_begin + (j )*cs_c; \
c11 = c_begin + (j )*ldc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc10 ); \
} \
else /* if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \
else if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) \
{ \
p10_dim = panel_dim_i; \
p10_len = diagoffc_i; \
@@ -308,7 +373,7 @@ void PASTEMAC(ch,varname )( \
j = p10_len; \
diagoffc12 = diagoffc_i - j; \
p12 = p_begin + (j )*ldp; \
c12 = c_begin + (j )*cs_c; \
c12 = c_begin + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
@@ -319,11 +384,12 @@ void PASTEMAC(ch,varname )( \
p11_n = panel_dim_i; \
j = diagoffc_i; \
p11 = p_begin + (j )*ldp; \
c11 = c_begin + (j )*cs_c; \
c11 = c_begin + (j )*ldc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
*/ \
\
/* Pack to P10. For upper storage, this includes the unstored
triangle of C11. */ \
@@ -351,11 +417,16 @@ void PASTEMAC(ch,varname )( \
p11_m, \
p11_n, \
beta_cast, \
c11, rs_c, cs_c, \
p11, 1, ldp ); \
c11, rs_c, cs_c, \
p11, rs_p11, cs_p11 ); \
} \
else \
{ \
/* Note that the following code executes if the current panel either:
- does not intersect the diagonal, or
- does intersect the diagonal, BUT the matrix is general
which means the entire current panel can be copied at once. */ \
\
/* We use some c10-specific variables here because we might need
to change them if the current panel is unstored. (The values
below are used if the current panel is stored.) */ \
@@ -368,8 +439,9 @@ void PASTEMAC(ch,varname )( \
adjustments so we refer to the data where it is actually
stored, and so we take conjugation into account. (Note
this implicitly assumes we are operating on a symmetric or
Hermitian matrix.) */ \
if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len ) ) \
Hermitian matrix, since a general matrix would not contain
any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel, *n_panel ) ) \
{ \
c10 = c10 + diagoffc_i * ( doff_t )cs_c + \
-diagoffc_i * ( doff_t )rs_c; \
@@ -440,7 +512,7 @@ void PASTEMAC(ch,varname )( \
p_begin, 1, cs_p, "%4.1f", "" ); \
if ( cs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: b copied", m_panel_max, n_panel_max, \
p_begin, panel_dim, 1, "%6.3f", "" ); \
p_begin, panel_dim, 1, "%8.5f", "" ); \
*/ \
} \
}

View File

@@ -271,7 +271,7 @@ void PASTEMAC(ch,varname )( \
} \
else if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) ) \
{ \
/* Only two of four cases implemented, since BLIS2 currently does
/* Only two of four cases implemented, since BLIS currently does
not support triangular packing of matrix B. */ \
/*if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
{ \

View File

@@ -48,13 +48,12 @@ void bli_gemm( obj_t* alpha,
gemm_t* cntl;
obj_t alpha_local;
obj_t beta_local;
num_t dt_targ_a;
num_t dt_targ_b;
num_t dt_targ_c;
num_t dt_exec;
obj_t a_local;
obj_t b_local;
obj_t c_local;
num_t dt_alpha;
num_t dt_beta;
bool_t pack_c = FALSE;
bool_t pack_c;
// Check parameters.
if ( bli_error_checking_is_enabled() )
@@ -67,105 +66,56 @@ void bli_gemm( obj_t* alpha,
return;
}
// Determine the target datatype of each matrix object.
bli_gemm_get_target_datatypes( a,
b,
c,
&dt_targ_a,
&dt_targ_b,
&dt_targ_c,
&pack_c );
// Set the target datatypes for each matrix object.
bli_obj_set_target_datatype( dt_targ_a, *a );
bli_obj_set_target_datatype( dt_targ_b, *b );
bli_obj_set_target_datatype( dt_targ_c, *c );
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( *a, a_local );
bli_obj_alias_to( *b, b_local );
bli_obj_alias_to( *c, c_local );
// Determine the execution datatype. Generally speaking, the
// execution datatype is the real projection of the target datatype
// of c. This rule holds unless the target datatypes of a and b
// are both complex, in which case the execution datatype is also
// complex.
if ( bli_is_complex( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
dt_exec = dt_targ_c;
else
dt_exec = bli_datatype_proj_to_real( dt_targ_c );
// Embed the execution datatype in all matrix operands.
bli_obj_set_execution_datatype( dt_exec, *a );
bli_obj_set_execution_datatype( dt_exec, *b );
bli_obj_set_execution_datatype( dt_exec, *c );
// Note that the precisions of the target datatypes of a, b, and c
// match. The domains, however, are not necessarily the same. There
// are eight possible combinations of target domains:
//
// case input target exec pack notes
// domain domain domain c?
// c+=a*b c+=a*b
// (0) r r r r r r r
// (1) r r c r r r r b demoted to real
// (2) r c r r r r r a demoted to real
// (3) r c c c c c c yes a*b demoted to real
// (4) c r r r r r r yes copynzm used to update c
// (5) c r c c r c r yes transposed to induce (6)
// (6) c c r c c r r ~ c and a treated as real
// (7) c c c c c c c
// ~ Must pack c if not column-stored (ie: row or general storage).
//
// There are two special cases: (5) and (6). Because the inner kernels
// assume column storage, it is easy to implement (6) since we can
// simply treat matrices c and a as real matrices with inflated m
// dimension and column stride and then proceed with a kernel for real
// computation. We cannot pull the same trick with case (5) because it
// would result in a mismatch in the k dimension. But we can transform
// case (5) into case (6) by transposing all arguments and swapping the
// a and b operands. Also, we will need to pack matrix c. That is what
// we do here.
if ( bli_is_real( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
// An optimization: If C is row-stored, transpose the entire operation
// so as to allow the macro-kernel more favorable access patterns
// through C. (The effect of the transposition of A and B is negligible
// because those operands are always packed to contiguous memory.)
if ( bli_obj_is_row_stored( *c ) )
{
bli_obj_swap_pointers( a, b );
bli_swap_types( dt_targ_a, dt_targ_b );
bli_obj_toggle_trans( *c );
bli_obj_toggle_trans( *a );
bli_obj_toggle_trans( *b );
bli_obj_swap( a_local, b_local );
bli_obj_induce_trans( a_local );
bli_obj_induce_trans( b_local );
bli_obj_induce_trans( c_local );
}
// Create an object to hold a copy-cast of alpha. Notice that we use
// the target datatype of matrix a. By inspecting the table above,
// this clearly works for cases (0) through (4), (6), and (7). It
// Also works for case (5) since it is transformed into case (6) by
// the above code.
dt_alpha = dt_targ_a;
// Set the target and execution datatypes of the objects, and apply
// any transformations necessary to handle mixed domain computation.
bli_gemm_set_targ_exec_datatypes( &a_local,
&b_local,
&c_local,
&dt_alpha,
&dt_beta,
&pack_c );
// Create an object to hold a copy-cast of alpha.
bli_obj_init_scalar_copy_of( dt_alpha,
BLIS_NO_CONJUGATE,
alpha,
&alpha_local );
// Create an object to hold a copy-cast of beta. Notice that we use
// the datatype of c. Here's why: If c is real and beta is complex,
// there is no reason to keep beta_local in the complex domain since
// the complex part of beta*c will not be stored. If c is complex and
// beta is real then beta is harmlessly promoted to complex.
dt_beta = bli_obj_datatype( *c );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_beta,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
// Choose the control tree based on whether it was determined we need
// to pack c.
//if ( pack_c ) gemm_cntl = gemm_cntl_packabc;
//else gemm_cntl = gemm_cntl_packab;
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Choose the control tree.
cntl = gemm_cntl;
if ( pack_c ) bli_abort();
// Invoke the internal back-end.
bli_gemm_int( &alpha_local,
a,
b,
&a_local,
&b_local,
&beta_local,
c,
&c_local,
cntl );
}

View File

@@ -75,8 +75,8 @@ void bli_gemm_basic_check( obj_t* alpha,
// We don't enforce general structure in matrix A so we can use gemm to
// implement hemm/symm. Instead, we only check this from the front-end.
e_val = bli_check_general_object( b );
bli_check_error_code( e_val );
//e_val = bli_check_general_object( b );
//bli_check_error_code( e_val );
e_val = bli_check_general_object( c );
bli_check_error_code( e_val );
@@ -99,6 +99,8 @@ void bli_gemm_check( obj_t* alpha,
e_val = bli_check_general_object( a );
bli_check_error_code( e_val );
e_val = bli_check_general_object( b );
bli_check_error_code( e_val );
}
void bli_gemm_int_check( obj_t* alpha,

View File

@@ -34,6 +34,99 @@
#include "blis.h"
void bli_gemm_set_targ_exec_datatypes( obj_t* a,
obj_t* b,
obj_t* c,
num_t* dt_alpha,
num_t* dt_beta,
bool_t* pack_c )
{
num_t dt_targ_a;
num_t dt_targ_b;
num_t dt_targ_c;
num_t dt_exec;
// Determine the target datatype of each matrix object.
bli_gemm_get_target_datatypes( a,
b,
c,
&dt_targ_a,
&dt_targ_b,
&dt_targ_c,
pack_c );
// Set the target datatypes for each matrix object.
bli_obj_set_target_datatype( dt_targ_a, *a );
bli_obj_set_target_datatype( dt_targ_b, *b );
bli_obj_set_target_datatype( dt_targ_c, *c );
// Determine the execution datatype. Generally speaking, the
// execution datatype is the real projection of the target datatype
// of c. This rule holds unless the target datatypes of a and b
// are both complex, in which case the execution datatype is also
// complex.
if ( bli_is_complex( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
dt_exec = dt_targ_c;
else
dt_exec = bli_datatype_proj_to_real( dt_targ_c );
// Embed the execution datatype in all matrix operands.
bli_obj_set_execution_datatype( dt_exec, *a );
bli_obj_set_execution_datatype( dt_exec, *b );
bli_obj_set_execution_datatype( dt_exec, *c );
// Note that the precisions of the target datatypes of a, b, and c
// match. The domains, however, are not necessarily the same. There
// are eight possible combinations of target domains:
//
// case input target exec pack notes
// domain domain domain c?
// c+=a*b c+=a*b
// (0) r r r r r r r
// (1) r r c r r r r b demoted to real
// (2) r c r r r r r a demoted to real
// (3) r c c c c c c yes a*b demoted to real
// (4) c r r r r r r yes copynzm used to update c
// (5) c r c c r c r yes transposed to induce (6)
// (6) c c r c c r r ~ c and a treated as real
// (7) c c c c c c c
// ~ Must pack c if not column-stored (ie: row or general storage).
//
// There are two special cases: (5) and (6). Because the inner kernels
// assume column storage, it is easy to implement (6) since we can
// simply treat matrices c and a as real matrices with inflated m
// dimension and column stride and then proceed with a kernel for real
// computation. We cannot pull the same trick with case (5) because it
// would result in a mismatch in the k dimension. But we can transform
// case (5) into case (6) by transposing all arguments and swapping the
// a and b operands. Also, we will need to pack matrix c. That is what
// we do here.
if ( bli_is_real( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
{
bli_obj_swap( *a, *b );
bli_swap_types( dt_targ_a, dt_targ_b );
bli_obj_toggle_trans( *c );
bli_obj_toggle_trans( *a );
bli_obj_toggle_trans( *b );
}
// Notice that we use the target datatype of matrix a. By inspecting
// the table above, this clearly works for cases (0) through (4), (6),
// and (7). It also works for case (5) since it is transformed into
// case (6) by the above code.
*dt_alpha = bli_obj_target_datatype( *a );
// Notice that we use the target datatype of matrix a. By inspecting
// the table above, this clearly works for cases (0) through (4), (6),
// and (7). It also works for case (5) since it is transformed into
// case (6) by the above code.
*dt_beta = bli_obj_datatype( *c );
// For now disable packing of C.
*pack_c = FALSE;
}
void bli_gemm_get_target_datatypes( obj_t* a,
obj_t* b,
obj_t* c,

View File

@@ -32,6 +32,13 @@
*/
void bli_gemm_set_targ_exec_datatypes( obj_t* a,
obj_t* b,
obj_t* c,
num_t* dt_alpha,
num_t* dt_beta,
bool_t* pack_c );
void bli_gemm_get_target_datatypes( obj_t* a,
obj_t* b,
obj_t* c,

View File

@@ -52,11 +52,9 @@ void bli_hemm( side_t side,
obj_t a_local;
obj_t b_local;
obj_t c_local;
num_t dt_targ_a;
num_t dt_targ_b;
num_t dt_targ_c;
num_t dt_alpha;
num_t dt_beta;
bool_t pack_c;
// Check parameters.
if ( bli_error_checking_is_enabled() )
@@ -69,47 +67,52 @@ void bli_hemm( side_t side,
return;
}
// Alias A and B in case we need to induce the right side case.
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( *a, a_local );
bli_obj_alias_to( *b, b_local );
bli_obj_alias_to( *c, c_local );
// For now, assume the storage datatypes are the desired target
// datatypes.
dt_targ_a = bli_obj_datatype( *a );
dt_targ_b = bli_obj_datatype( *b );
dt_targ_c = bli_obj_datatype( *c );
// We implement hemm in terms of gemm. But in order to do so we must make
// sure matrix A is on the correct side for our gemm kernel. We assume
// gemm is implemented with a block-panel kernel, thus, we will only
// directly support the BLIS_LEFT case. We handle the BLIS_RIGHT case by
// transposing the operation. Since A is Hermitian, we can mark it for
// conjugation instead of transpostion (though transposition should be
// correctly handled as well).
if ( bli_is_right( side ) )
// An optimization: If C is row-stored, transpose the entire operation
// so as to allow the macro-kernel more favorable access patterns
// through C.
if ( bli_obj_is_row_stored( *c ) )
{
bli_toggle_side( side );
bli_obj_toggle_conj( a_local );
bli_obj_toggle_trans( b_local );
bli_obj_toggle_trans( c_local );
bli_obj_induce_trans( b_local );
bli_obj_induce_trans( c_local );
}
// Create an object to hold a copy-cast of alpha. Notice that we use
// the target datatype of matrix A.
dt_alpha = dt_targ_a;
// Swap A and B if multiplying A from the right so that "B" contains
// the Hermitian matrix.
if ( bli_is_right( side ) )
{
bli_obj_swap( a_local, b_local );
}
// Set the target and execution datatypes of the objects, and apply
// any transformations necessary to handle mixed domain computation.
bli_gemm_set_targ_exec_datatypes( &a_local,
&b_local,
&c_local,
&dt_alpha,
&dt_beta,
&pack_c );
// Create an object to hold a copy-cast of alpha.
bli_obj_init_scalar_copy_of( dt_alpha,
BLIS_NO_CONJUGATE,
alpha,
&alpha_local );
// Create an object to hold a copy-cast of beta. Notice that we use
// the datatype of C.
dt_beta = bli_obj_datatype( *c );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_beta,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Choose the control tree.
cntl = hemm_cntl;

View File

@@ -118,7 +118,8 @@ void bli_hemm_cntl_init()
hemm_kr,
hemm_nr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
//FALSE, // already dense; densify not necessary
TRUE, // densify (if needed)
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?

View File

@@ -51,15 +51,13 @@ void bli_her2k( obj_t* alpha,
obj_t alpha_conj_local;
obj_t beta_local;
obj_t c_local;
obj_t ah;
obj_t bh;
num_t dt_targ_a;
num_t dt_targ_b;
num_t dt_targ_c;
num_t dt_exec;
obj_t a_local;
obj_t bh_local;
obj_t b_local;
obj_t ah_local;
num_t dt_alpha;
num_t dt_beta;
//bool_t pack_c = FALSE;
bool_t pack_c;
// Check parameters.
if ( bli_error_checking_is_enabled() )
@@ -72,84 +70,76 @@ void bli_her2k( obj_t* alpha,
return;
}
// Alias C so we can reset it as the root object (in case it is not
// already a root object).
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( *a, a_local );
bli_obj_alias_to( *b, b_local );
bli_obj_alias_to( *c, c_local );
bli_obj_set_as_root( c_local );
// Create objects to track A' and B' (for the second rank-k update).
bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *a, ah );
bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *b, bh );
// For her2k, the first and second right-hand "B" operands are simply B'
// and A'.
bli_obj_alias_to( *b, bh_local );
bli_obj_induce_trans( bh_local );
bli_obj_toggle_conj( bh_local );
bli_obj_alias_to( *a, ah_local );
bli_obj_induce_trans( ah_local );
bli_obj_toggle_conj( ah_local );
// Determine the target datatype of each matrix object.
//bli_her2k_get_target_datatypes( a,
// b,
// c,
// &dt_targ_a,
// &dt_targ_b,
// &dt_targ_c,
// &pack_c );
// An optimization: If C is row-stored, transpose the entire operation
// so as to allow the macro-kernel more favorable access patterns
// through C. (The effect of the transposition of A and A' is negligible
// because those operands are always packed to contiguous memory.)
if ( bli_obj_is_row_stored( c_local ) )
{
bli_obj_toggle_conj( a_local );
bli_obj_toggle_conj( bh_local );
bli_obj_toggle_conj( b_local );
bli_obj_toggle_conj( ah_local );
dt_targ_a = bli_obj_datatype( *a );
dt_targ_b = bli_obj_datatype( *b );
dt_targ_c = bli_obj_datatype( *c );
bli_obj_induce_trans( c_local );
}
// Set the target datatypes for each matrix object.
bli_obj_set_target_datatype( dt_targ_a, *a );
bli_obj_set_target_datatype( dt_targ_b, *b );
bli_obj_set_target_datatype( dt_targ_c, *c );
// Set the target and execution datatypes of the objects, and apply
// any transformations necessary to handle mixed domain computation.
bli_her2k_set_targ_exec_datatypes( &a_local,
&bh_local,
&b_local,
&ah_local,
&c_local,
&dt_alpha,
&dt_beta,
&pack_c );
// Determine the execution datatype.
dt_exec = dt_targ_a;
// Embed the execution datatype in all matrix operands.
bli_obj_set_execution_datatype( dt_exec, *a );
bli_obj_set_execution_datatype( dt_exec, *b );
bli_obj_set_execution_datatype( dt_exec, *c );
// Create an object to hold a copy-cast of alpha. Notice that we use
// the target datatype of matrix a. By inspecting the table above,
// this clearly works for cases (0) through (4), (6), and (7). It
// Also works for case (5) since it is transformed into case (6) by
// the above code.
dt_alpha = dt_targ_a;
// Create an object to hold a copy-cast of alpha.
bli_obj_init_scalar_copy_of( dt_alpha,
BLIS_NO_CONJUGATE,
alpha,
&alpha_local );
// Create an object to hold a copy-cast of conj(alpha).
dt_alpha = dt_targ_b;
bli_obj_init_scalar_copy_of( dt_alpha,
BLIS_CONJUGATE,
alpha,
&alpha_conj_local );
// Create an object to hold a copy-cast of beta. Notice that we use
// the datatype of c. Here's why: If c is real and beta is complex,
// there is no reason to keep beta_local in the complex domain since
// the complex part of beta*c will not be stored. If c is complex and
// beta is real then beta is harmlessly promoted to complex.
dt_beta = bli_obj_datatype( *c );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_beta,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
// Choose the control tree based on whether it was determined we need
// to pack c.
//if ( pack_c ) her2k_cntl = her2k_cntl_packabc;
//else her2k_cntl = her2k_cntl_packab;
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Choose the control tree.
cntl = her2k_cntl;
//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Invoke the internal back-end.
bli_her2k_int( &alpha_local,
a,
&bh,
&a_local,
&bh_local,
&alpha_conj_local,
b,
&ah,
&b_local,
&ah_local,
&beta_local,
&c_local,
cntl );

View File

@@ -35,6 +35,7 @@
#include "bli_her2k_cntl.h"
#include "bli_her2k_check.h"
#include "bli_her2k_int.h"
#include "bli_her2k_target.h"
#include "bli_her2k_l_blk_var1.h"
#include "bli_her2k_u_blk_var1.h"

View File

@@ -0,0 +1,99 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_her2k_set_targ_exec_datatypes( obj_t* a,
obj_t* bh,
obj_t* b,
obj_t* ah,
obj_t* c,
num_t* dt_alpha,
num_t* dt_beta,
bool_t* pack_c )
{
num_t dt_targ_a;
num_t dt_targ_bh;
num_t dt_targ_b;
num_t dt_targ_ah;
num_t dt_targ_c;
num_t dt_exec;
// Determine the target datatype of each matrix object.
/*
bli_gemm_get_target_datatypes( a,
bh,
c,
&dt_targ_a,
&dt_targ_bh,
&dt_targ_c,
pack_c );
*/
dt_targ_a = bli_obj_datatype( *a );
dt_targ_bh = bli_obj_datatype( *bh );
dt_targ_b = bli_obj_datatype( *b );
dt_targ_ah = bli_obj_datatype( *ah );
dt_targ_c = bli_obj_datatype( *c );
dt_exec = dt_targ_a;
// Set the target datatypes for each matrix object.
bli_obj_set_target_datatype( dt_targ_a, *a );
bli_obj_set_target_datatype( dt_targ_bh, *bh );
bli_obj_set_target_datatype( dt_targ_b, *b );
bli_obj_set_target_datatype( dt_targ_ah, *ah );
bli_obj_set_target_datatype( dt_targ_c, *c );
// Embed the execution datatype in all matrix operands.
bli_obj_set_execution_datatype( dt_exec, *a );
bli_obj_set_execution_datatype( dt_exec, *bh );
bli_obj_set_execution_datatype( dt_exec, *b );
bli_obj_set_execution_datatype( dt_exec, *ah );
bli_obj_set_execution_datatype( dt_exec, *c );
// Notice that we use the target datatype of matrix a. By inspecting
// the table above, this clearly works for cases (0) through (4), (6),
// and (7). It also works for case (5) since it is transformed into
// case (6) by the above code.
*dt_alpha = bli_obj_target_datatype( *a );
// Notice that we use the target datatype of matrix a. By inspecting
// the table above, this clearly works for cases (0) through (4), (6),
// and (7). It also works for case (5) since it is transformed into
// case (6) by the above code.
*dt_beta = bli_obj_datatype( *c );
// For now disable packing of C.
*pack_c = FALSE;
}

View File

@@ -0,0 +1,43 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
void bli_her2k_set_targ_exec_datatypes( obj_t* a,
obj_t* bh,
obj_t* b,
obj_t* ah,
obj_t* c,
num_t* dt_alpha,
num_t* dt_beta,
bool_t* pack_c );

View File

@@ -47,15 +47,12 @@ void bli_herk( obj_t* alpha,
herk_t* cntl;
obj_t alpha_local;
obj_t beta_local;
obj_t a_local;
obj_t ah_local;
obj_t c_local;
obj_t ah;
num_t dt_targ_a;
num_t dt_targ_ah;
num_t dt_targ_c;
num_t dt_exec;
num_t dt_alpha;
num_t dt_beta;
//bool_t pack_c = FALSE;
bool_t pack_c;
// Check parameters.
if ( bli_error_checking_is_enabled() )
@@ -68,84 +65,58 @@ void bli_herk( obj_t* alpha,
return;
}
// Alias C so we can reset it as the root object (in case it is not
// already a root object).
// Alias A and C in case we need to apply transformations.
bli_obj_alias_to( *a, a_local );
bli_obj_alias_to( *c, c_local );
bli_obj_set_as_root( c_local );
// For herk, the right-hand "B" operand is simply A'.
bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *a, ah );
bli_obj_alias_to( *a, ah_local );
bli_obj_induce_trans( ah_local );
bli_obj_toggle_conj( ah_local );
// Determine the target datatype of each matrix object.
//bli_herk_get_target_datatypes( a,
// c,
// &dt_targ_a,
// &dt_targ_c,
// &pack_c );
dt_targ_a = bli_obj_datatype( *a );
dt_targ_ah = bli_obj_datatype( *a );
dt_targ_c = bli_obj_datatype( *c );
// An optimization: If C is row-stored, transpose the entire operation
// so as to allow the macro-kernel more favorable access patterns
// through C. (The effect of the transposition of A and A' is negligible
// because those operands are always packed to contiguous memory.)
if ( bli_obj_is_row_stored( c_local ) )
{
bli_obj_toggle_conj( a_local );
bli_obj_toggle_conj( ah_local );
// Set the target datatypes for each matrix object.
bli_obj_set_target_datatype( dt_targ_a, *a );
bli_obj_set_target_datatype( dt_targ_ah, ah );
bli_obj_set_target_datatype( dt_targ_c, *c );
bli_obj_induce_trans( c_local );
}
// Determine the execution datatype. For herk, the execution
// datatype is always the target datatype of a.
dt_exec = dt_targ_a;
// Set the target and execution datatypes of the objects, and apply
// any transformations necessary to handle mixed domain computation.
bli_herk_set_targ_exec_datatypes( &a_local,
&ah_local,
&c_local,
&dt_alpha,
&dt_beta,
&pack_c );
// Embed the execution datatype in all matrix operands.
bli_obj_set_execution_datatype( dt_exec, *a );
bli_obj_set_execution_datatype( dt_exec, ah );
bli_obj_set_execution_datatype( dt_exec, *c );
// Note that the precisions of the target datatypes of a and c
// match. The domains, however, are not necessarily the same. There
// are four possible combinations of target domains:
//
// case input target exec pack notes
// domain domain domain c?
// c+=a*a' c+=a*a'
// (0) r r r r r r r
// (1) r c c c c c c yes a*a' demoted to real
// (2) c r r r r r r yes copynzm used to update c
// (3) c c c c c c c
// Create an object to hold a copy-cast of alpha. Notice that we use
// the target datatype of matrix a. By inspecting the table above,
// this clearly works for cases (0) through (4), (6), and (7). It
// Also works for case (5) since it is transformed into case (6) by
// the above code.
dt_alpha = dt_targ_a;
// Create an object to hold a copy-cast of alpha.
bli_obj_init_scalar_copy_of( dt_alpha,
BLIS_NO_CONJUGATE,
alpha,
&alpha_local );
// Create an object to hold a copy-cast of beta. Notice that we use
// the datatype of c. Here's why: If c is real and beta is complex,
// there is no reason to keep beta_local in the complex domain since
// the complex part of beta*c will not be stored. If c is complex and
// beta is real then beta is harmlessly promoted to complex.
dt_beta = bli_obj_datatype( *c );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_beta,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
// Choose the control tree based on whether it was determined we need
// to pack c.
//if ( pack_c ) herk_cntl = herk_cntl_packabc;
//else herk_cntl = herk_cntl_packab;
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Choose the control tree.
cntl = herk_cntl;
//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Invoke the internal back-end.
bli_herk_int( &alpha_local,
a,
&ah,
&a_local,
&ah_local,
&beta_local,
&c_local,
cntl );

View File

@@ -35,6 +35,7 @@
#include "bli_herk_cntl.h"
#include "bli_herk_check.h"
#include "bli_herk_int.h"
#include "bli_herk_target.h"
#include "bli_herk_l_blk_var1.h"
#include "bli_herk_u_blk_var1.h"

View File

@@ -0,0 +1,203 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_herk_set_targ_exec_datatypes( obj_t* a,
obj_t* ah,
obj_t* c,
num_t* dt_alpha,
num_t* dt_beta,
bool_t* pack_c )
{
num_t dt_targ_a;
num_t dt_targ_ah;
num_t dt_targ_c;
num_t dt_exec;
// Determine the target datatype of each matrix object.
/*
bli_herk_get_target_datatypes( a,
c,
&dt_targ_a,
&dt_targ_c,
pack_c );
*/
dt_targ_a = bli_obj_datatype( *a );
dt_targ_ah = bli_obj_datatype( *ah );
dt_targ_c = bli_obj_datatype( *c );
dt_exec = dt_targ_a;
// Set the target datatypes for each matrix object.
bli_obj_set_target_datatype( dt_targ_a, *a );
bli_obj_set_target_datatype( dt_targ_ah, *ah );
bli_obj_set_target_datatype( dt_targ_c, *c );
// Embed the execution datatype in all matrix operands.
bli_obj_set_execution_datatype( dt_exec, *a );
bli_obj_set_execution_datatype( dt_exec, *ah );
bli_obj_set_execution_datatype( dt_exec, *c );
*dt_alpha = bli_obj_target_datatype( *a );
*dt_beta = bli_obj_datatype( *c );
// For now disable packing of C.
*pack_c = FALSE;
}
/*
void bli_herk_get_target_datatypes( obj_t* a,
obj_t* c,
num_t* dt_a,
num_t* dt_c,
bool_t* pack_c )
{
prec_t tp_a, tp_c;
dom_t td_a, td_c;
// Determine the target domains for each object.
bli_herk_get_target_domain( a,
c,
&td_a,
&td_c,
pack_c );
// Determine the target precisions for each object.
bli_herk_get_target_prec( a,
c,
&tp_a,
&tp_c,
pack_c );
// The target datatype of an object is simply the union of its
// target domain and target precision.
*dt_a = td_a | tp_a;
*dt_c = td_c | tp_c;
}
*/
/*
void bli_herk_get_target_domain( obj_t* a,
obj_t* c,
dom_t* td_a,
dom_t* td_c,
bool_t* pack_c )
{
dom_t d_a = bli_obj_domain( *a );
dom_t d_c = bli_obj_domain( *c );
// Note that the precisions of the target datatypes of a and c
// match. The domains, however, are not necessarily the same. There
// are four possible combinations of target domains:
//
// case input target exec pack notes
// domain domain domain c?
// c+=a*a' c+=a*a'
// (0) r r r r r r r
// (1) r c c c c c c yes a*a' demoted to real
// (2) c r r r r r r yes copynzm used to update c
// (3) c c c c c c c
if ( bli_is_real( d_c ) )
{
if ( bli_is_complex( d_a ) )
{
*td_c = *td_a = BLIS_COMPLEX;
*pack_c = TRUE;
}
else
{
*td_c = *td_a = BLIS_REAL;
}
}
else // if ( bli_is_complex( d_c ) )
{
*td_a = d_a;
if ( bli_is_real( d_a ) )
{
*td_c = BLIS_REAL;
*pack_c = TRUE;
}
else
{
*td_c = BLIS_COMPLEX;
if ( bli_obj_is_real( *a ) ) *pack_c = TRUE;
if ( bli_obj_is_complex( *a ) )
{
if ( !bli_obj_is_col_stored( *c ) ) *pack_c = TRUE;
}
}
}
}
*/
/*
void bli_herk_get_target_prec( obj_t* a,
obj_t* c,
prec_t* tp_a,
prec_t* tp_c,
bool_t* pack_c )
{
prec_t p_a = bli_obj_precision( *a );
prec_t p_c = bli_obj_precision( *c );
if ( bli_is_single_prec( p_c ) )
{
if ( bli_is_double_prec( p_a ) )
{
*tp_c = *tp_a = BLIS_DOUBLE_PREC;
*pack_c = TRUE;
}
else
{
*tp_c = *tp_a = BLIS_SINGLE_PREC;
}
}
else // if ( bli_is_double_prec( p_c ) )
{
if ( bli_is_single_prec( p_a ) )
{
*tp_c = *tp_a = BLIS_SINGLE_PREC;
*pack_c = TRUE;
}
else
{
*tp_c = *tp_a = BLIS_DOUBLE_PREC;
}
}
}
*/

View File

@@ -0,0 +1,66 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
void bli_herk_set_targ_exec_datatypes( obj_t* a,
obj_t* ah,
obj_t* c,
num_t* dt_alpha,
num_t* dt_beta,
bool_t* pack_c );
/*
void bli_herk_get_target_datatypes( obj_t* a,
obj_t* b,
obj_t* c,
num_t* dt_a,
num_t* dt_b,
num_t* dt_c,
bool_t* pack_c );
void bli_herk_get_target_domain( obj_t* a,
obj_t* b,
obj_t* c,
dom_t* td_a,
dom_t* td_b,
dom_t* td_c,
bool_t* pack_c );
void bli_herk_get_target_prec( obj_t* a,
obj_t* b,
obj_t* c,
prec_t* tp_a,
prec_t* tp_b,
prec_t* tp_c,
bool_t* pack_c );
*/

View File

@@ -52,11 +52,9 @@ void bli_symm( side_t side,
obj_t a_local;
obj_t b_local;
obj_t c_local;
num_t dt_targ_a;
num_t dt_targ_b;
num_t dt_targ_c;
num_t dt_alpha;
num_t dt_beta;
bool_t pack_c;
// Check parameters.
if ( bli_error_checking_is_enabled() )
@@ -69,45 +67,51 @@ void bli_symm( side_t side,
return;
}
// For now, assume the storage datatypes are the desired target
// datatypes.
dt_targ_a = bli_obj_datatype( *a );
dt_targ_b = bli_obj_datatype( *b );
dt_targ_c = bli_obj_datatype( *c );
// Alias A and B in case we need to induce the right side case.
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( *a, a_local );
bli_obj_alias_to( *b, b_local );
bli_obj_alias_to( *c, c_local );
// We implement symm in terms of gemm. But in order to do so we must make
// sure matrix A is on the correct side for our gemm kernel. We assume
// gemm is implemented with a block-panel kernel, thus, we will only
// directly support the BLIS_LEFT case. We handle the BLIS_RIGHT case by
// transposing the operation. Since A is symmetric, we do not mark it
// for any conjugation or transposition.
if ( bli_is_right( side ) )
// An optimization: If C is row-stored, transpose the entire operation
// so as to allow the macro-kernel more favorable access patterns
// through C.
if ( bli_obj_is_row_stored( *c ) )
{
bli_obj_toggle_trans( b_local );
bli_obj_toggle_trans( c_local );
bli_toggle_side( side );
bli_obj_induce_trans( b_local );
bli_obj_induce_trans( c_local );
}
// Create an object to hold a copy-cast of alpha. Notice that we use
// the target datatype of matrix A.
dt_alpha = dt_targ_a;
// Swap A and B if multiplying A from the right so that "B" contains
// the symmetric matrix.
if ( bli_is_right( side ) )
{
bli_obj_swap( a_local, b_local );
}
// Set the target and execution datatypes of the objects, and apply
// any transformations necessary to handle mixed domain computation.
bli_gemm_set_targ_exec_datatypes( &a_local,
&b_local,
&c_local,
&dt_alpha,
&dt_beta,
&pack_c );
// Create an object to hold a copy-cast of alpha.
bli_obj_init_scalar_copy_of( dt_alpha,
BLIS_NO_CONJUGATE,
alpha,
&alpha_local );
// Create an object to hold a copy-cast of beta. Notice that we use
// the datatype of C.
dt_beta = bli_obj_datatype( *c );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_beta,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Choose the control tree. We can just use hemm since the algorithm
// is nearly identical to that of symm.
cntl = hemm_cntl;

View File

@@ -50,15 +50,13 @@ void bli_syr2k( obj_t* alpha,
obj_t alpha_local;
obj_t beta_local;
obj_t c_local;
obj_t at;
obj_t bt;
num_t dt_targ_a;
num_t dt_targ_b;
num_t dt_targ_c;
num_t dt_exec;
obj_t a_local;
obj_t bt_local;
obj_t b_local;
obj_t at_local;
num_t dt_alpha;
num_t dt_beta;
//bool_t pack_c = FALSE;
bool_t pack_c;
// Check parameters.
if ( bli_error_checking_is_enabled() )
@@ -71,69 +69,52 @@ void bli_syr2k( obj_t* alpha,
return;
}
// Alias C so we can reset it as the root object (in case it is not
// already a root object).
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( *a, a_local );
bli_obj_alias_to( *b, b_local );
bli_obj_alias_to( *c, c_local );
bli_obj_set_as_root( c_local );
// Create objects to track A^T and B^T (for the second rank-k update).
bli_obj_alias_with_trans( BLIS_TRANSPOSE, *a, at );
bli_obj_alias_with_trans( BLIS_TRANSPOSE, *b, bt );
// For syr2k, the first and second right-hand "B" operands are simply B'
// and A'.
bli_obj_alias_to( *b, bt_local );
bli_obj_induce_trans( bt_local );
bli_obj_alias_to( *a, at_local );
bli_obj_induce_trans( at_local );
// Determine the target datatype of each matrix object.
//bli_syr2k_get_target_datatypes( a,
// b,
// c,
// &dt_targ_a,
// &dt_targ_b,
// &dt_targ_c,
// &pack_c );
// An optimization: If C is row-stored, transpose the entire operation
// so as to allow the macro-kernel more favorable access patterns
// through C. (The effect of the transposition of A and A' is negligible
// because those operands are always packed to contiguous memory.)
if ( bli_obj_is_row_stored( c_local ) )
{
bli_obj_induce_trans( c_local );
}
dt_targ_a = bli_obj_datatype( *a );
dt_targ_b = bli_obj_datatype( *b );
dt_targ_c = bli_obj_datatype( *c );
// Set the target and execution datatypes of the objects, and apply
// any transformations necessary to handle mixed domain computation.
bli_her2k_set_targ_exec_datatypes( &a_local,
&bt_local,
&b_local,
&at_local,
&c_local,
&dt_alpha,
&dt_beta,
&pack_c );
// Set the target datatypes for each matrix object.
bli_obj_set_target_datatype( dt_targ_a, *a );
bli_obj_set_target_datatype( dt_targ_b, *b );
bli_obj_set_target_datatype( dt_targ_c, *c );
// Determine the execution datatype.
dt_exec = dt_targ_a;
// Embed the execution datatype in all matrix operands.
bli_obj_set_execution_datatype( dt_exec, *a );
bli_obj_set_execution_datatype( dt_exec, *b );
bli_obj_set_execution_datatype( dt_exec, *c );
// Create an object to hold a copy-cast of alpha. Notice that we use
// the target datatype of matrix a. By inspecting the table above,
// this clearly works for cases (0) through (4), (6), and (7). It
// Also works for case (5) since it is transformed into case (6) by
// the above code.
dt_alpha = dt_targ_a;
// Create an object to hold a copy-cast of alpha.
bli_obj_init_scalar_copy_of( dt_alpha,
BLIS_NO_CONJUGATE,
alpha,
&alpha_local );
// Create an object to hold a copy-cast of beta. Notice that we use
// the datatype of c. Here's why: If c is real and beta is complex,
// there is no reason to keep beta_local in the complex domain since
// the complex part of beta*c will not be stored. If c is complex and
// beta is real then beta is harmlessly promoted to complex.
dt_beta = bli_obj_datatype( *c );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_beta,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
// Choose the control tree based on whether it was determined we need
// to pack c.
//if ( pack_c ) syr2k_cntl = her2k_cntl_packabc;
//else syr2k_cntl = her2k_cntl_packab;
//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Choose the control tree. We can just use her2k since the algorithm
// is nearly identical to that of syr2k.
@@ -141,11 +122,11 @@ void bli_syr2k( obj_t* alpha,
// Invoke the internal back-end.
bli_her2k_int( &alpha_local,
a,
&bt,
&a_local,
&bt_local,
&alpha_local,
b,
&at,
&b_local,
&at_local,
&beta_local,
&c_local,
cntl );

View File

@@ -47,15 +47,12 @@ void bli_syrk( obj_t* alpha,
herk_t* cntl;
obj_t alpha_local;
obj_t beta_local;
obj_t a_local;
obj_t at_local;
obj_t c_local;
obj_t at;
num_t dt_targ_a;
num_t dt_targ_at;
num_t dt_targ_c;
num_t dt_exec;
num_t dt_alpha;
num_t dt_beta;
//bool_t pack_c = FALSE;
bool_t pack_c;
// Check parameters.
if ( bli_error_checking_is_enabled() )
@@ -68,78 +65,46 @@ void bli_syrk( obj_t* alpha,
return;
}
// Alias C so we can reset it as the root object (in case it is not
// already a root object).
// Alias A and C in case we need to apply transformations.
bli_obj_alias_to( *a, a_local );
bli_obj_alias_to( *c, c_local );
bli_obj_set_as_root( c_local );
// For syrk, the right-hand "B" operand is simply A^T.
bli_obj_alias_with_trans( BLIS_TRANSPOSE, *a, at );
// For herk, the right-hand "B" operand is simply A^T.
bli_obj_alias_to( *a, at_local );
bli_obj_induce_trans( at_local );
// Determine the target datatype of each matrix object.
//bli_syrk_get_target_datatypes( a,
// c,
// &dt_targ_a,
// &dt_targ_c,
// &pack_c );
dt_targ_a = bli_obj_datatype( *a );
dt_targ_at = bli_obj_datatype( *a );
dt_targ_c = bli_obj_datatype( *c );
// An optimization: If C is row-stored, transpose the entire operation
// so as to allow the macro-kernel more favorable access patterns
// through C. (The effect of the transposition of A and A^T is negligible
// because those operands are always packed to contiguous memory.)
if ( bli_obj_is_row_stored( c_local ) )
{
bli_obj_induce_trans( c_local );
}
// Set the target datatypes for each matrix object.
bli_obj_set_target_datatype( dt_targ_a, *a );
bli_obj_set_target_datatype( dt_targ_at, at );
bli_obj_set_target_datatype( dt_targ_c, *c );
// Set the target and execution datatypes of the objects, and apply
// any transformations necessary to handle mixed domain computation.
bli_herk_set_targ_exec_datatypes( &a_local,
&at_local,
&c_local,
&dt_alpha,
&dt_beta,
&pack_c );
// Determine the execution datatype. For syrk, the execution
// datatype is always the target datatype of a.
dt_exec = dt_targ_a;
// Embed the execution datatype in all matrix operands.
bli_obj_set_execution_datatype( dt_exec, *a );
bli_obj_set_execution_datatype( dt_exec, at );
bli_obj_set_execution_datatype( dt_exec, *c );
// Note that the precisions of the target datatypes of a and c
// match. The domains, however, are not necessarily the same. There
// are four possible combinations of target domains:
//
// case input target exec pack notes
// domain domain domain c?
// c+=a*a' c+=a*a'
// (0) r r r r r r r
// (1) r c c c c c c yes a*a^T demoted to real
// (2) c r r r r r r yes copynzm used to update c
// (3) c c c c c c c
// Create an object to hold a copy-cast of alpha. Notice that we use
// the target datatype of matrix a. By inspecting the table above,
// this clearly works for cases (0) through (4), (6), and (7). It
// Also works for case (5) since it is transformed into case (6) by
// the above code.
dt_alpha = dt_targ_a;
// Create an object to hold a copy-cast of alpha.
bli_obj_init_scalar_copy_of( dt_alpha,
BLIS_NO_CONJUGATE,
alpha,
&alpha_local );
// Create an object to hold a copy-cast of beta. Notice that we use
// the datatype of c. Here's why: If c is real and beta is complex,
// there is no reason to keep beta_local in the complex domain since
// the complex part of beta*c will not be stored. If c is complex and
// beta is real then beta is harmlessly promoted to complex.
dt_beta = bli_obj_datatype( *c );
// Create an object to hold a copy-cast of beta.
bli_obj_init_scalar_copy_of( dt_beta,
BLIS_NO_CONJUGATE,
beta,
&beta_local );
// Choose the control tree based on whether it was determined we need
// to pack c.
//if ( pack_c ) syrk_cntl = herk_cntl_packabc;
//else syrk_cntl = herk_cntl_packab;
//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Choose the control tree. We can just use herk since the algorithm
// is nearly identical to that of syrk.
@@ -147,8 +112,8 @@ void bli_syrk( obj_t* alpha,
// Invoke the internal back-end.
bli_herk_int( &alpha_local,
a,
&at,
&a_local,
&at_local,
&beta_local,
&c_local,
cntl );

View File

@@ -894,6 +894,16 @@ bli_obj_width_stored( obj )
: ( bli_obj_buffer_at_off( obj ) ) \
)
// Swap objects
#define bli_obj_swap( a, b ) \
{ \
obj_t t; \
t = b; b = a; a = t; \
}
// Swap object pointers
#define bli_obj_swap_pointers( a, b ) \
@@ -902,6 +912,7 @@ bli_obj_width_stored( obj )
t = b; b = a; a = t; \
}
// If a transposition is needed, induce one: swap dimensions, increments
// and offsets, and then clear the trans bit.

View File

@@ -127,13 +127,13 @@
// min, max, abs
#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) )
#define bli_abs( a ) ( (a) < 0 ? -(a) : (a) )
#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) )
#define bli_abs( a ) ( (a) < 0 ? -(a) : (a) )
// fmin, fmax, fabs
#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
#define bli_fmin( a, b ) bli_min( a, b )
#define bli_fmax( a, b ) bli_max( a, b )
#define bli_fabs( a ) ( (a) < 0.0 ? -(a) : (a) )