mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Various level-3 optimizations for row storage.
Details: - Implemented remaining two cases within bli_packm_blk_var2(), which allow packing from a lower or upper-stored symmetric/Hermitian matrix to column panels (which are row-stored). Previously one could only pack to row panels (which are column-stored). - Implemented various optimizations in the level-3 front-ends that allow more favorable access through row-stored matrices for gemm, hemm, herk, her2k, symm, syrk, and syr2k. - Cleaned up code in level-3 front-ends that has to do with setting target and execution datatypes.
This commit is contained in:
@@ -143,6 +143,7 @@ void PASTEMAC(ch,varname )( \
|
||||
dim_t panel_len; \
|
||||
doff_t diagoffc_i; \
|
||||
doff_t diagoffc_inc; \
|
||||
doff_t diagoffc_i_abs; \
|
||||
dim_t panel_dim_i; \
|
||||
inc_t vs_c; \
|
||||
inc_t incc, ldc; \
|
||||
@@ -171,6 +172,7 @@ void PASTEMAC(ch,varname )( \
|
||||
ctype* restrict p11; \
|
||||
dim_t p11_m; \
|
||||
dim_t p11_n; \
|
||||
inc_t rs_p11, cs_p11; \
|
||||
\
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
@@ -191,7 +193,7 @@ void PASTEMAC(ch,varname )( \
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to column panels. */ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len = m; \
|
||||
panel_dim = pd_p; \
|
||||
@@ -204,10 +206,12 @@ void PASTEMAC(ch,varname )( \
|
||||
n_panel = &panel_dim_i; \
|
||||
m_panel_max = m_max; \
|
||||
n_panel_max = panel_dim; \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to row panels. */ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len = n; \
|
||||
panel_dim = pd_p; \
|
||||
@@ -220,6 +224,8 @@ void PASTEMAC(ch,varname )( \
|
||||
n_panel = &n; \
|
||||
m_panel_max = panel_dim; \
|
||||
n_panel_max = n_max; \
|
||||
rs_p11 = 1; \
|
||||
cs_p11 = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
@@ -237,11 +243,11 @@ void PASTEMAC(ch,varname )( \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \
|
||||
panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
p_begin = p_cast + (ip )*ps_p; \
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
p_begin = p_cast + (ip )*ps_p; \
|
||||
\
|
||||
/* If the current panel intersects the diagonal and C is either
|
||||
upper- or lower-stored, then we assume C is symmetric or
|
||||
@@ -252,17 +258,76 @@ void PASTEMAC(ch,varname )( \
|
||||
if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
|
||||
bli_is_upper_or_lower( uploc ) ) \
|
||||
{ \
|
||||
/* Only two of four cases implemented, since BLIS2 currently does
|
||||
not support triangular packing of matrix B (which is row-stored). */ \
|
||||
/*if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
|
||||
diagoffc_i_abs = bli_abs( diagoffc_i ); \
|
||||
\
|
||||
/*if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) */ \
|
||||
if ( ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
p10_dim = panel_dim_i; \
|
||||
p10_len = diagoffc_i_abs; \
|
||||
p10 = p_begin; \
|
||||
c10 = c_begin; \
|
||||
incc10 = incc; \
|
||||
ldc10 = ldc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim_i; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_i_abs - j; \
|
||||
p12 = p_begin + (j )*ldp; \
|
||||
c12 = c_begin + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
p11_m = panel_dim_i; \
|
||||
p11_n = panel_dim_i; \
|
||||
j = diagoffc_i_abs; \
|
||||
p11 = p_begin + (j )*ldp; \
|
||||
c11 = c_begin + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) \
|
||||
/*else if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \
|
||||
else /* if ( ( bli_is_row_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
p10_dim = panel_dim_i; \
|
||||
p10_len = diagoffc_i_abs + panel_dim_i; \
|
||||
diagoffc10 = diagoffc_i; \
|
||||
p10 = p_begin; \
|
||||
c10 = c_begin; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim_i; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p_begin + (j )*ldp; \
|
||||
c12 = c_begin + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
p11_m = panel_dim_i; \
|
||||
p11_n = panel_dim_i; \
|
||||
j = diagoffc_i_abs; \
|
||||
p11 = p_begin + (j )*ldp; \
|
||||
c11 = c_begin + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
else*/ if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
|
||||
/*
|
||||
else if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim_i; \
|
||||
p10_len = diagoffc_i + panel_dim_i; \
|
||||
@@ -279,7 +344,7 @@ void PASTEMAC(ch,varname )( \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p_begin + (j )*ldp; \
|
||||
c12 = c_begin + (j )*cs_c; \
|
||||
c12 = c_begin + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
@@ -288,12 +353,12 @@ void PASTEMAC(ch,varname )( \
|
||||
p11_n = panel_dim_i; \
|
||||
j = diagoffc_i; \
|
||||
p11 = p_begin + (j )*ldp; \
|
||||
c11 = c_begin + (j )*cs_c; \
|
||||
c11 = c_begin + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \
|
||||
else if ( bli_is_col_stored( rs_p, cs_p ) && bli_is_lower( uploc ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim_i; \
|
||||
p10_len = diagoffc_i; \
|
||||
@@ -308,7 +373,7 @@ void PASTEMAC(ch,varname )( \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_i - j; \
|
||||
p12 = p_begin + (j )*ldp; \
|
||||
c12 = c_begin + (j )*cs_c; \
|
||||
c12 = c_begin + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
@@ -319,11 +384,12 @@ void PASTEMAC(ch,varname )( \
|
||||
p11_n = panel_dim_i; \
|
||||
j = diagoffc_i; \
|
||||
p11 = p_begin + (j )*ldp; \
|
||||
c11 = c_begin + (j )*cs_c; \
|
||||
c11 = c_begin + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
/* Pack to P10. For upper storage, this includes the unstored
|
||||
triangle of C11. */ \
|
||||
@@ -351,11 +417,16 @@ void PASTEMAC(ch,varname )( \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, 1, ldp ); \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Note that the following code executes if the current panel either:
|
||||
- does not intersect the diagonal, or
|
||||
- does intersect the diagonal, BUT the matrix is general
|
||||
which means the entire current panel can be copied at once. */ \
|
||||
\
|
||||
/* We use some c10-specific variables here because we might need
|
||||
to change them if the current panel is unstored. (The values
|
||||
below are used if the current panel is stored.) */ \
|
||||
@@ -368,8 +439,9 @@ void PASTEMAC(ch,varname )( \
|
||||
adjustments so we refer to the data where it is actually
|
||||
stored, and so we take conjugation into account. (Note
|
||||
this implicitly assumes we are operating on a symmetric or
|
||||
Hermitian matrix.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len ) ) \
|
||||
Hermitian matrix, since a general matrix would not contain
|
||||
any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel, *n_panel ) ) \
|
||||
{ \
|
||||
c10 = c10 + diagoffc_i * ( doff_t )cs_c + \
|
||||
-diagoffc_i * ( doff_t )rs_c; \
|
||||
@@ -440,7 +512,7 @@ void PASTEMAC(ch,varname )( \
|
||||
p_begin, 1, cs_p, "%4.1f", "" ); \
|
||||
if ( cs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: b copied", m_panel_max, n_panel_max, \
|
||||
p_begin, panel_dim, 1, "%6.3f", "" ); \
|
||||
p_begin, panel_dim, 1, "%8.5f", "" ); \
|
||||
*/ \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -271,7 +271,7 @@ void PASTEMAC(ch,varname )( \
|
||||
} \
|
||||
else if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) ) \
|
||||
{ \
|
||||
/* Only two of four cases implemented, since BLIS2 currently does
|
||||
/* Only two of four cases implemented, since BLIS currently does
|
||||
not support triangular packing of matrix B. */ \
|
||||
/*if ( bli_is_row_stored( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
|
||||
{ \
|
||||
|
||||
@@ -48,13 +48,12 @@ void bli_gemm( obj_t* alpha,
|
||||
gemm_t* cntl;
|
||||
obj_t alpha_local;
|
||||
obj_t beta_local;
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_b;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_exec;
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
num_t dt_alpha;
|
||||
num_t dt_beta;
|
||||
bool_t pack_c = FALSE;
|
||||
bool_t pack_c;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
@@ -67,105 +66,56 @@ void bli_gemm( obj_t* alpha,
|
||||
return;
|
||||
}
|
||||
|
||||
// Determine the target datatype of each matrix object.
|
||||
bli_gemm_get_target_datatypes( a,
|
||||
b,
|
||||
c,
|
||||
&dt_targ_a,
|
||||
&dt_targ_b,
|
||||
&dt_targ_c,
|
||||
&pack_c );
|
||||
|
||||
// Set the target datatypes for each matrix object.
|
||||
bli_obj_set_target_datatype( dt_targ_a, *a );
|
||||
bli_obj_set_target_datatype( dt_targ_b, *b );
|
||||
bli_obj_set_target_datatype( dt_targ_c, *c );
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
bli_obj_alias_to( *b, b_local );
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
|
||||
// Determine the execution datatype. Generally speaking, the
|
||||
// execution datatype is the real projection of the target datatype
|
||||
// of c. This rule holds unless the target datatypes of a and b
|
||||
// are both complex, in which case the execution datatype is also
|
||||
// complex.
|
||||
if ( bli_is_complex( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
|
||||
dt_exec = dt_targ_c;
|
||||
else
|
||||
dt_exec = bli_datatype_proj_to_real( dt_targ_c );
|
||||
|
||||
// Embed the execution datatype in all matrix operands.
|
||||
bli_obj_set_execution_datatype( dt_exec, *a );
|
||||
bli_obj_set_execution_datatype( dt_exec, *b );
|
||||
bli_obj_set_execution_datatype( dt_exec, *c );
|
||||
|
||||
// Note that the precisions of the target datatypes of a, b, and c
|
||||
// match. The domains, however, are not necessarily the same. There
|
||||
// are eight possible combinations of target domains:
|
||||
//
|
||||
// case input target exec pack notes
|
||||
// domain domain domain c?
|
||||
// c+=a*b c+=a*b
|
||||
// (0) r r r r r r r
|
||||
// (1) r r c r r r r b demoted to real
|
||||
// (2) r c r r r r r a demoted to real
|
||||
// (3) r c c c c c c yes a*b demoted to real
|
||||
// (4) c r r r r r r yes copynzm used to update c
|
||||
// (5) c r c c r c r yes transposed to induce (6)
|
||||
// (6) c c r c c r r ~ c and a treated as real
|
||||
// (7) c c c c c c c
|
||||
// ~ Must pack c if not column-stored (ie: row or general storage).
|
||||
//
|
||||
// There are two special cases: (5) and (6). Because the inner kernels
|
||||
// assume column storage, it is easy to implement (6) since we can
|
||||
// simply treat matrices c and a as real matrices with inflated m
|
||||
// dimension and column stride and then proceed with a kernel for real
|
||||
// computation. We cannot pull the same trick with case (5) because it
|
||||
// would result in a mismatch in the k dimension. But we can transform
|
||||
// case (5) into case (6) by transposing all arguments and swapping the
|
||||
// a and b operands. Also, we will need to pack matrix c. That is what
|
||||
// we do here.
|
||||
if ( bli_is_real( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
|
||||
// An optimization: If C is row-stored, transpose the entire operation
|
||||
// so as to allow the macro-kernel more favorable access patterns
|
||||
// through C. (The effect of the transposition of A and B is negligible
|
||||
// because those operands are always packed to contiguous memory.)
|
||||
if ( bli_obj_is_row_stored( *c ) )
|
||||
{
|
||||
bli_obj_swap_pointers( a, b );
|
||||
bli_swap_types( dt_targ_a, dt_targ_b );
|
||||
bli_obj_toggle_trans( *c );
|
||||
bli_obj_toggle_trans( *a );
|
||||
bli_obj_toggle_trans( *b );
|
||||
bli_obj_swap( a_local, b_local );
|
||||
|
||||
bli_obj_induce_trans( a_local );
|
||||
bli_obj_induce_trans( b_local );
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Create an object to hold a copy-cast of alpha. Notice that we use
|
||||
// the target datatype of matrix a. By inspecting the table above,
|
||||
// this clearly works for cases (0) through (4), (6), and (7). It
|
||||
// Also works for case (5) since it is transformed into case (6) by
|
||||
// the above code.
|
||||
dt_alpha = dt_targ_a;
|
||||
// Set the target and execution datatypes of the objects, and apply
|
||||
// any transformations necessary to handle mixed domain computation.
|
||||
bli_gemm_set_targ_exec_datatypes( &a_local,
|
||||
&b_local,
|
||||
&c_local,
|
||||
&dt_alpha,
|
||||
&dt_beta,
|
||||
&pack_c );
|
||||
|
||||
// Create an object to hold a copy-cast of alpha.
|
||||
bli_obj_init_scalar_copy_of( dt_alpha,
|
||||
BLIS_NO_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
|
||||
// Create an object to hold a copy-cast of beta. Notice that we use
|
||||
// the datatype of c. Here's why: If c is real and beta is complex,
|
||||
// there is no reason to keep beta_local in the complex domain since
|
||||
// the complex part of beta*c will not be stored. If c is complex and
|
||||
// beta is real then beta is harmlessly promoted to complex.
|
||||
dt_beta = bli_obj_datatype( *c );
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_beta,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
// Choose the control tree based on whether it was determined we need
|
||||
// to pack c.
|
||||
//if ( pack_c ) gemm_cntl = gemm_cntl_packabc;
|
||||
//else gemm_cntl = gemm_cntl_packab;
|
||||
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
// Choose the control tree.
|
||||
cntl = gemm_cntl;
|
||||
if ( pack_c ) bli_abort();
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_gemm_int( &alpha_local,
|
||||
a,
|
||||
b,
|
||||
&a_local,
|
||||
&b_local,
|
||||
&beta_local,
|
||||
c,
|
||||
&c_local,
|
||||
cntl );
|
||||
}
|
||||
|
||||
|
||||
@@ -75,8 +75,8 @@ void bli_gemm_basic_check( obj_t* alpha,
|
||||
// We don't enforce general structure in matrix A so we can use gemm to
|
||||
// implement hemm/symm. Instead, we only check this from the front-end.
|
||||
|
||||
e_val = bli_check_general_object( b );
|
||||
bli_check_error_code( e_val );
|
||||
//e_val = bli_check_general_object( b );
|
||||
//bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_general_object( c );
|
||||
bli_check_error_code( e_val );
|
||||
@@ -99,6 +99,8 @@ void bli_gemm_check( obj_t* alpha,
|
||||
e_val = bli_check_general_object( a );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_general_object( b );
|
||||
bli_check_error_code( e_val );
|
||||
}
|
||||
|
||||
void bli_gemm_int_check( obj_t* alpha,
|
||||
|
||||
@@ -34,6 +34,99 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_gemm_set_targ_exec_datatypes( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
num_t* dt_alpha,
|
||||
num_t* dt_beta,
|
||||
bool_t* pack_c )
|
||||
{
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_b;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_exec;
|
||||
|
||||
// Determine the target datatype of each matrix object.
|
||||
bli_gemm_get_target_datatypes( a,
|
||||
b,
|
||||
c,
|
||||
&dt_targ_a,
|
||||
&dt_targ_b,
|
||||
&dt_targ_c,
|
||||
pack_c );
|
||||
|
||||
// Set the target datatypes for each matrix object.
|
||||
bli_obj_set_target_datatype( dt_targ_a, *a );
|
||||
bli_obj_set_target_datatype( dt_targ_b, *b );
|
||||
bli_obj_set_target_datatype( dt_targ_c, *c );
|
||||
|
||||
// Determine the execution datatype. Generally speaking, the
|
||||
// execution datatype is the real projection of the target datatype
|
||||
// of c. This rule holds unless the target datatypes of a and b
|
||||
// are both complex, in which case the execution datatype is also
|
||||
// complex.
|
||||
if ( bli_is_complex( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
|
||||
dt_exec = dt_targ_c;
|
||||
else
|
||||
dt_exec = bli_datatype_proj_to_real( dt_targ_c );
|
||||
|
||||
// Embed the execution datatype in all matrix operands.
|
||||
bli_obj_set_execution_datatype( dt_exec, *a );
|
||||
bli_obj_set_execution_datatype( dt_exec, *b );
|
||||
bli_obj_set_execution_datatype( dt_exec, *c );
|
||||
|
||||
// Note that the precisions of the target datatypes of a, b, and c
|
||||
// match. The domains, however, are not necessarily the same. There
|
||||
// are eight possible combinations of target domains:
|
||||
//
|
||||
// case input target exec pack notes
|
||||
// domain domain domain c?
|
||||
// c+=a*b c+=a*b
|
||||
// (0) r r r r r r r
|
||||
// (1) r r c r r r r b demoted to real
|
||||
// (2) r c r r r r r a demoted to real
|
||||
// (3) r c c c c c c yes a*b demoted to real
|
||||
// (4) c r r r r r r yes copynzm used to update c
|
||||
// (5) c r c c r c r yes transposed to induce (6)
|
||||
// (6) c c r c c r r ~ c and a treated as real
|
||||
// (7) c c c c c c c
|
||||
// ~ Must pack c if not column-stored (ie: row or general storage).
|
||||
//
|
||||
// There are two special cases: (5) and (6). Because the inner kernels
|
||||
// assume column storage, it is easy to implement (6) since we can
|
||||
// simply treat matrices c and a as real matrices with inflated m
|
||||
// dimension and column stride and then proceed with a kernel for real
|
||||
// computation. We cannot pull the same trick with case (5) because it
|
||||
// would result in a mismatch in the k dimension. But we can transform
|
||||
// case (5) into case (6) by transposing all arguments and swapping the
|
||||
// a and b operands. Also, we will need to pack matrix c. That is what
|
||||
// we do here.
|
||||
if ( bli_is_real( dt_targ_a ) && bli_is_complex( dt_targ_b ) )
|
||||
{
|
||||
bli_obj_swap( *a, *b );
|
||||
bli_swap_types( dt_targ_a, dt_targ_b );
|
||||
bli_obj_toggle_trans( *c );
|
||||
bli_obj_toggle_trans( *a );
|
||||
bli_obj_toggle_trans( *b );
|
||||
}
|
||||
|
||||
// Notice that we use the target datatype of matrix a. By inspecting
|
||||
// the table above, this clearly works for cases (0) through (4), (6),
|
||||
// and (7). It also works for case (5) since it is transformed into
|
||||
// case (6) by the above code.
|
||||
*dt_alpha = bli_obj_target_datatype( *a );
|
||||
|
||||
// Notice that we use the storage datatype of matrix c. If c is real
|
||||
// and beta is complex, the imaginary part of beta*c would not be
|
||||
// stored anyway, so beta need not stay complex. If c is complex and
|
||||
// beta is real, then beta is harmlessly promoted to complex.
|
||||
*dt_beta = bli_obj_datatype( *c );
|
||||
|
||||
// For now disable packing of C.
|
||||
*pack_c = FALSE;
|
||||
}
|
||||
|
||||
|
||||
void bli_gemm_get_target_datatypes( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
|
||||
@@ -32,6 +32,13 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_gemm_set_targ_exec_datatypes( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
num_t* dt_alpha,
|
||||
num_t* dt_beta,
|
||||
bool_t* pack_c );
|
||||
|
||||
void bli_gemm_get_target_datatypes( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
|
||||
@@ -52,11 +52,9 @@ void bli_hemm( side_t side,
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_b;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_alpha;
|
||||
num_t dt_beta;
|
||||
bool_t pack_c;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
@@ -69,47 +67,52 @@ void bli_hemm( side_t side,
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A and B in case we need to induce the right side case.
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
bli_obj_alias_to( *b, b_local );
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
|
||||
// For now, assume the storage datatypes are the desired target
|
||||
// datatypes.
|
||||
dt_targ_a = bli_obj_datatype( *a );
|
||||
dt_targ_b = bli_obj_datatype( *b );
|
||||
dt_targ_c = bli_obj_datatype( *c );
|
||||
|
||||
// We implement hemm in terms of gemm. But in order to do so we must make
|
||||
// sure matrix A is on the correct side for our gemm kernel. We assume
|
||||
// gemm is implemented with a block-panel kernel, thus, we will only
|
||||
// directly support the BLIS_LEFT case. We handle the BLIS_RIGHT case by
|
||||
// transposing the operation. Since A is Hermitian, we can mark it for
|
||||
// conjugation instead of transposition (though transposition should be
|
||||
// correctly handled as well).
|
||||
if ( bli_is_right( side ) )
|
||||
// An optimization: If C is row-stored, transpose the entire operation
|
||||
// so as to allow the macro-kernel more favorable access patterns
|
||||
// through C.
|
||||
if ( bli_obj_is_row_stored( *c ) )
|
||||
{
|
||||
bli_toggle_side( side );
|
||||
bli_obj_toggle_conj( a_local );
|
||||
bli_obj_toggle_trans( b_local );
|
||||
bli_obj_toggle_trans( c_local );
|
||||
bli_obj_induce_trans( b_local );
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Create an object to hold a copy-cast of alpha. Notice that we use
|
||||
// the target datatype of matrix A.
|
||||
dt_alpha = dt_targ_a;
|
||||
// Swap A and B if multiplying A from the right so that "B" contains
|
||||
// the Hermitian matrix.
|
||||
if ( bli_is_right( side ) )
|
||||
{
|
||||
bli_obj_swap( a_local, b_local );
|
||||
}
|
||||
|
||||
// Set the target and execution datatypes of the objects, and apply
|
||||
// any transformations necessary to handle mixed domain computation.
|
||||
bli_gemm_set_targ_exec_datatypes( &a_local,
|
||||
&b_local,
|
||||
&c_local,
|
||||
&dt_alpha,
|
||||
&dt_beta,
|
||||
&pack_c );
|
||||
|
||||
// Create an object to hold a copy-cast of alpha.
|
||||
bli_obj_init_scalar_copy_of( dt_alpha,
|
||||
BLIS_NO_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
|
||||
// Create an object to hold a copy-cast of beta. Notice that we use
|
||||
// the datatype of C.
|
||||
dt_beta = bli_obj_datatype( *c );
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_beta,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
// Choose the control tree.
|
||||
cntl = hemm_cntl;
|
||||
|
||||
|
||||
@@ -118,7 +118,8 @@ void bli_hemm_cntl_init()
|
||||
hemm_kr,
|
||||
hemm_nr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
//FALSE, // already dense; densify not necessary
|
||||
TRUE, // densify (if needed)
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
|
||||
@@ -51,15 +51,13 @@ void bli_her2k( obj_t* alpha,
|
||||
obj_t alpha_conj_local;
|
||||
obj_t beta_local;
|
||||
obj_t c_local;
|
||||
obj_t ah;
|
||||
obj_t bh;
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_b;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_exec;
|
||||
obj_t a_local;
|
||||
obj_t bh_local;
|
||||
obj_t b_local;
|
||||
obj_t ah_local;
|
||||
num_t dt_alpha;
|
||||
num_t dt_beta;
|
||||
//bool_t pack_c = FALSE;
|
||||
bool_t pack_c;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
@@ -72,84 +70,76 @@ void bli_her2k( obj_t* alpha,
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias C so we can reset it as the root object (in case it is not
|
||||
// already a root object).
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
bli_obj_alias_to( *b, b_local );
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
bli_obj_set_as_root( c_local );
|
||||
|
||||
// Create objects to track A' and B' (for the second rank-k update).
|
||||
bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *a, ah );
|
||||
bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *b, bh );
|
||||
// For her2k, the first and second right-hand "B" operands are simply B'
|
||||
// and A'.
|
||||
bli_obj_alias_to( *b, bh_local );
|
||||
bli_obj_induce_trans( bh_local );
|
||||
bli_obj_toggle_conj( bh_local );
|
||||
bli_obj_alias_to( *a, ah_local );
|
||||
bli_obj_induce_trans( ah_local );
|
||||
bli_obj_toggle_conj( ah_local );
|
||||
|
||||
// Determine the target datatype of each matrix object.
|
||||
//bli_her2k_get_target_datatypes( a,
|
||||
// b,
|
||||
// c,
|
||||
// &dt_targ_a,
|
||||
// &dt_targ_b,
|
||||
// &dt_targ_c,
|
||||
// &pack_c );
|
||||
// An optimization: If C is row-stored, transpose the entire operation
|
||||
// so as to allow the macro-kernel more favorable access patterns
|
||||
// through C. (The effect of the transposition of A and A' is negligible
|
||||
// because those operands are always packed to contiguous memory.)
|
||||
if ( bli_obj_is_row_stored( c_local ) )
|
||||
{
|
||||
bli_obj_toggle_conj( a_local );
|
||||
bli_obj_toggle_conj( bh_local );
|
||||
bli_obj_toggle_conj( b_local );
|
||||
bli_obj_toggle_conj( ah_local );
|
||||
|
||||
dt_targ_a = bli_obj_datatype( *a );
|
||||
dt_targ_b = bli_obj_datatype( *b );
|
||||
dt_targ_c = bli_obj_datatype( *c );
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Set the target datatypes for each matrix object.
|
||||
bli_obj_set_target_datatype( dt_targ_a, *a );
|
||||
bli_obj_set_target_datatype( dt_targ_b, *b );
|
||||
bli_obj_set_target_datatype( dt_targ_c, *c );
|
||||
// Set the target and execution datatypes of the objects, and apply
|
||||
// any transformations necessary to handle mixed domain computation.
|
||||
bli_her2k_set_targ_exec_datatypes( &a_local,
|
||||
&bh_local,
|
||||
&b_local,
|
||||
&ah_local,
|
||||
&c_local,
|
||||
&dt_alpha,
|
||||
&dt_beta,
|
||||
&pack_c );
|
||||
|
||||
// Determine the execution datatype.
|
||||
dt_exec = dt_targ_a;
|
||||
|
||||
// Embed the execution datatype in all matrix operands.
|
||||
bli_obj_set_execution_datatype( dt_exec, *a );
|
||||
bli_obj_set_execution_datatype( dt_exec, *b );
|
||||
bli_obj_set_execution_datatype( dt_exec, *c );
|
||||
|
||||
// Create an object to hold a copy-cast of alpha. Notice that we use
|
||||
// the target datatype of matrix a. By inspecting the table above,
|
||||
// this clearly works for cases (0) through (4), (6), and (7). It
|
||||
// Also works for case (5) since it is transformed into case (6) by
|
||||
// the above code.
|
||||
dt_alpha = dt_targ_a;
|
||||
// Create an object to hold a copy-cast of alpha.
|
||||
bli_obj_init_scalar_copy_of( dt_alpha,
|
||||
BLIS_NO_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
|
||||
// Create an object to hold a copy-cast of conj(alpha).
|
||||
dt_alpha = dt_targ_b;
|
||||
bli_obj_init_scalar_copy_of( dt_alpha,
|
||||
BLIS_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_conj_local );
|
||||
|
||||
// Create an object to hold a copy-cast of beta. Notice that we use
|
||||
// the datatype of c. Here's why: If c is real and beta is complex,
|
||||
// there is no reason to keep beta_local in the complex domain since
|
||||
// the complex part of beta*c will not be stored. If c is complex and
|
||||
// beta is real then beta is harmlessly promoted to complex.
|
||||
dt_beta = bli_obj_datatype( *c );
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_beta,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
// Choose the control tree based on whether it was determined we need
|
||||
// to pack c.
|
||||
//if ( pack_c ) her2k_cntl = her2k_cntl_packabc;
|
||||
//else her2k_cntl = her2k_cntl_packab;
|
||||
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
// Choose the control tree.
|
||||
cntl = her2k_cntl;
|
||||
//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_her2k_int( &alpha_local,
|
||||
a,
|
||||
&bh,
|
||||
&a_local,
|
||||
&bh_local,
|
||||
&alpha_conj_local,
|
||||
b,
|
||||
&ah,
|
||||
&b_local,
|
||||
&ah_local,
|
||||
&beta_local,
|
||||
&c_local,
|
||||
cntl );
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
#include "bli_her2k_cntl.h"
|
||||
#include "bli_her2k_check.h"
|
||||
#include "bli_her2k_int.h"
|
||||
#include "bli_her2k_target.h"
|
||||
|
||||
#include "bli_her2k_l_blk_var1.h"
|
||||
#include "bli_her2k_u_blk_var1.h"
|
||||
|
||||
99
frame/3/her2k/bli_her2k_target.c
Normal file
99
frame/3/her2k/bli_her2k_target.c
Normal file
@@ -0,0 +1,99 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_her2k_set_targ_exec_datatypes( obj_t* a,
|
||||
obj_t* bh,
|
||||
obj_t* b,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
num_t* dt_alpha,
|
||||
num_t* dt_beta,
|
||||
bool_t* pack_c )
|
||||
{
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_bh;
|
||||
num_t dt_targ_b;
|
||||
num_t dt_targ_ah;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_exec;
|
||||
|
||||
// Determine the target datatype of each matrix object.
|
||||
/*
|
||||
bli_gemm_get_target_datatypes( a,
|
||||
bh,
|
||||
c,
|
||||
&dt_targ_a,
|
||||
&dt_targ_bh,
|
||||
&dt_targ_c,
|
||||
pack_c );
|
||||
*/
|
||||
dt_targ_a = bli_obj_datatype( *a );
|
||||
dt_targ_bh = bli_obj_datatype( *bh );
|
||||
dt_targ_b = bli_obj_datatype( *b );
|
||||
dt_targ_ah = bli_obj_datatype( *ah );
|
||||
dt_targ_c = bli_obj_datatype( *c );
|
||||
dt_exec = dt_targ_a;
|
||||
|
||||
// Set the target datatypes for each matrix object.
|
||||
bli_obj_set_target_datatype( dt_targ_a, *a );
|
||||
bli_obj_set_target_datatype( dt_targ_bh, *bh );
|
||||
bli_obj_set_target_datatype( dt_targ_b, *b );
|
||||
bli_obj_set_target_datatype( dt_targ_ah, *ah );
|
||||
bli_obj_set_target_datatype( dt_targ_c, *c );
|
||||
|
||||
// Embed the execution datatype in all matrix operands.
|
||||
bli_obj_set_execution_datatype( dt_exec, *a );
|
||||
bli_obj_set_execution_datatype( dt_exec, *bh );
|
||||
bli_obj_set_execution_datatype( dt_exec, *b );
|
||||
bli_obj_set_execution_datatype( dt_exec, *ah );
|
||||
bli_obj_set_execution_datatype( dt_exec, *c );
|
||||
|
||||
// Notice that we use the target datatype of matrix a. By inspecting
|
||||
// the table above, this clearly works for cases (0) through (4), (6),
|
||||
// and (7). It also works for case (5) since it is transformed into
|
||||
// case (6) by the above code.
|
||||
*dt_alpha = bli_obj_target_datatype( *a );
|
||||
|
||||
// Notice that we use the target datatype of matrix a. By inspecting
|
||||
// the table above, this clearly works for cases (0) through (4), (6),
|
||||
// and (7). It also works for case (5) since it is transformed into
|
||||
// case (6) by the above code.
|
||||
*dt_beta = bli_obj_datatype( *c );
|
||||
|
||||
// For now disable packing of C.
|
||||
*pack_c = FALSE;
|
||||
}
|
||||
|
||||
43
frame/3/her2k/bli_her2k_target.h
Normal file
43
frame/3/her2k/bli_her2k_target.h
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Sets the target datatype of each of a, bh, b, ah, and c to that object's
// own datatype, and sets the execution datatype of all five to the
// datatype of a.
// Outputs: *dt_alpha receives the target datatype of a, *dt_beta receives
// the datatype of c, and *pack_c is set to FALSE (packing of C is
// currently disabled).
void bli_her2k_set_targ_exec_datatypes( obj_t* a,
|
||||
obj_t* bh,
|
||||
obj_t* b,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
num_t* dt_alpha,
|
||||
num_t* dt_beta,
|
||||
bool_t* pack_c );
|
||||
|
||||
@@ -47,15 +47,12 @@ void bli_herk( obj_t* alpha,
|
||||
herk_t* cntl;
|
||||
obj_t alpha_local;
|
||||
obj_t beta_local;
|
||||
obj_t a_local;
|
||||
obj_t ah_local;
|
||||
obj_t c_local;
|
||||
obj_t ah;
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_ah;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_exec;
|
||||
num_t dt_alpha;
|
||||
num_t dt_beta;
|
||||
//bool_t pack_c = FALSE;
|
||||
bool_t pack_c;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
@@ -68,84 +65,58 @@ void bli_herk( obj_t* alpha,
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias C so we can reset it as the root object (in case it is not
|
||||
// already a root object).
|
||||
// Alias A and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
bli_obj_set_as_root( c_local );
|
||||
|
||||
// For herk, the right-hand "B" operand is simply A'.
|
||||
bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *a, ah );
|
||||
bli_obj_alias_to( *a, ah_local );
|
||||
bli_obj_induce_trans( ah_local );
|
||||
bli_obj_toggle_conj( ah_local );
|
||||
|
||||
// Determine the target datatype of each matrix object.
|
||||
//bli_herk_get_target_datatypes( a,
|
||||
// c,
|
||||
// &dt_targ_a,
|
||||
// &dt_targ_c,
|
||||
// &pack_c );
|
||||
|
||||
dt_targ_a = bli_obj_datatype( *a );
|
||||
dt_targ_ah = bli_obj_datatype( *a );
|
||||
dt_targ_c = bli_obj_datatype( *c );
|
||||
// An optimization: If C is row-stored, transpose the entire operation
|
||||
// so as to allow the macro-kernel more favorable access patterns
|
||||
// through C. (The effect of the transposition of A and A' is negligible
|
||||
// because those operands are always packed to contiguous memory.)
|
||||
if ( bli_obj_is_row_stored( c_local ) )
|
||||
{
|
||||
bli_obj_toggle_conj( a_local );
|
||||
bli_obj_toggle_conj( ah_local );
|
||||
|
||||
// Set the target datatypes for each matrix object.
|
||||
bli_obj_set_target_datatype( dt_targ_a, *a );
|
||||
bli_obj_set_target_datatype( dt_targ_ah, ah );
|
||||
bli_obj_set_target_datatype( dt_targ_c, *c );
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Determine the execution datatype. For herk, the execution
|
||||
// datatype is always the target datatype of a.
|
||||
dt_exec = dt_targ_a;
|
||||
// Set the target and execution datatypes of the objects, and apply
|
||||
// any transformations necessary to handle mixed domain computation.
|
||||
bli_herk_set_targ_exec_datatypes( &a_local,
|
||||
&ah_local,
|
||||
&c_local,
|
||||
&dt_alpha,
|
||||
&dt_beta,
|
||||
&pack_c );
|
||||
|
||||
// Embed the execution datatype in all matrix operands.
|
||||
bli_obj_set_execution_datatype( dt_exec, *a );
|
||||
bli_obj_set_execution_datatype( dt_exec, ah );
|
||||
bli_obj_set_execution_datatype( dt_exec, *c );
|
||||
|
||||
// Note that the precisions of the target datatypes of a and c
|
||||
// match. The domains, however, are not necessarily the same. There
|
||||
// are four possible combinations of target domains:
|
||||
//
|
||||
// case input target exec pack notes
|
||||
// domain domain domain c?
|
||||
// c+=a*a' c+=a*a'
|
||||
// (0) r r r r r r r
|
||||
// (1) r c c c c c c yes a*a' demoted to real
|
||||
// (2) c r r r r r r yes copynzm used to update c
|
||||
// (3) c c c c c c c
|
||||
|
||||
// Create an object to hold a copy-cast of alpha. Notice that we use
|
||||
// the target datatype of matrix a. By inspecting the table above,
|
||||
// this clearly works for cases (0) through (4), (6), and (7). It
|
||||
// Also works for case (5) since it is transformed into case (6) by
|
||||
// the above code.
|
||||
dt_alpha = dt_targ_a;
|
||||
// Create an object to hold a copy-cast of alpha.
|
||||
bli_obj_init_scalar_copy_of( dt_alpha,
|
||||
BLIS_NO_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
|
||||
// Create an object to hold a copy-cast of beta. Notice that we use
|
||||
// the datatype of c. Here's why: If c is real and beta is complex,
|
||||
// there is no reason to keep beta_local in the complex domain since
|
||||
// the complex part of beta*c will not be stored. If c is complex and
|
||||
// beta is real then beta is harmlessly promoted to complex.
|
||||
dt_beta = bli_obj_datatype( *c );
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_beta,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
// Choose the control tree based on whether it was determined we need
|
||||
// to pack c.
|
||||
//if ( pack_c ) herk_cntl = herk_cntl_packabc;
|
||||
//else herk_cntl = herk_cntl_packab;
|
||||
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
// Choose the control tree.
|
||||
cntl = herk_cntl;
|
||||
//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_herk_int( &alpha_local,
|
||||
a,
|
||||
&ah,
|
||||
&a_local,
|
||||
&ah_local,
|
||||
&beta_local,
|
||||
&c_local,
|
||||
cntl );
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
#include "bli_herk_cntl.h"
|
||||
#include "bli_herk_check.h"
|
||||
#include "bli_herk_int.h"
|
||||
#include "bli_herk_target.h"
|
||||
|
||||
#include "bli_herk_l_blk_var1.h"
|
||||
#include "bli_herk_u_blk_var1.h"
|
||||
|
||||
203
frame/3/herk/bli_herk_target.c
Normal file
203
frame/3/herk/bli_herk_target.c
Normal file
@@ -0,0 +1,203 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_herk_set_targ_exec_datatypes( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
num_t* dt_alpha,
|
||||
num_t* dt_beta,
|
||||
bool_t* pack_c )
|
||||
{
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_ah;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_exec;
|
||||
|
||||
// Determine the target datatype of each matrix object.
|
||||
/*
|
||||
bli_herk_get_target_datatypes( a,
|
||||
c,
|
||||
&dt_targ_a,
|
||||
&dt_targ_c,
|
||||
pack_c );
|
||||
*/
|
||||
dt_targ_a = bli_obj_datatype( *a );
|
||||
dt_targ_ah = bli_obj_datatype( *ah );
|
||||
dt_targ_c = bli_obj_datatype( *c );
|
||||
dt_exec = dt_targ_a;
|
||||
|
||||
// Set the target datatypes for each matrix object.
|
||||
bli_obj_set_target_datatype( dt_targ_a, *a );
|
||||
bli_obj_set_target_datatype( dt_targ_ah, *ah );
|
||||
bli_obj_set_target_datatype( dt_targ_c, *c );
|
||||
|
||||
// Embed the execution datatype in all matrix operands.
|
||||
bli_obj_set_execution_datatype( dt_exec, *a );
|
||||
bli_obj_set_execution_datatype( dt_exec, *ah );
|
||||
bli_obj_set_execution_datatype( dt_exec, *c );
|
||||
|
||||
*dt_alpha = bli_obj_target_datatype( *a );
|
||||
|
||||
*dt_beta = bli_obj_datatype( *c );
|
||||
|
||||
// For now disable packing of C.
|
||||
*pack_c = FALSE;
|
||||
}
|
||||
|
||||
/*
|
||||
void bli_herk_get_target_datatypes( obj_t* a,
|
||||
obj_t* c,
|
||||
num_t* dt_a,
|
||||
num_t* dt_c,
|
||||
bool_t* pack_c )
|
||||
{
|
||||
prec_t tp_a, tp_c;
|
||||
dom_t td_a, td_c;
|
||||
|
||||
// Determine the target domains for each object.
|
||||
bli_herk_get_target_domain( a,
|
||||
c,
|
||||
&td_a,
|
||||
&td_c,
|
||||
pack_c );
|
||||
|
||||
// Determine the target precisions for each object.
|
||||
bli_herk_get_target_prec( a,
|
||||
c,
|
||||
&tp_a,
|
||||
&tp_c,
|
||||
pack_c );
|
||||
|
||||
// The target datatype of an object is simply the union of its
|
||||
// target domain and target precision.
|
||||
*dt_a = td_a | tp_a;
|
||||
*dt_c = td_c | tp_c;
|
||||
}
|
||||
*/
|
||||
/*
|
||||
void bli_herk_get_target_domain( obj_t* a,
|
||||
obj_t* c,
|
||||
dom_t* td_a,
|
||||
dom_t* td_c,
|
||||
bool_t* pack_c )
|
||||
{
|
||||
dom_t d_a = bli_obj_domain( *a );
|
||||
dom_t d_c = bli_obj_domain( *c );
|
||||
|
||||
// Note that the precisions of the target datatypes of a and c
|
||||
// match. The domains, however, are not necessarily the same. There
|
||||
// are four possible combinations of target domains:
|
||||
//
|
||||
// case input target exec pack notes
|
||||
// domain domain domain c?
|
||||
// c+=a*a' c+=a*a'
|
||||
// (0) r r r r r r r
|
||||
// (1) r c c c c c c yes a*a' demoted to real
|
||||
// (2) c r r r r r r yes copynzm used to update c
|
||||
// (3) c c c c c c c
|
||||
|
||||
if ( bli_is_real( d_c ) )
|
||||
{
|
||||
if ( bli_is_complex( d_a ) )
|
||||
{
|
||||
*td_c = *td_a = BLIS_COMPLEX;
|
||||
*pack_c = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
*td_c = *td_a = BLIS_REAL;
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_complex( d_c ) )
|
||||
{
|
||||
*td_a = d_a;
|
||||
|
||||
if ( bli_is_real( d_a ) )
|
||||
{
|
||||
*td_c = BLIS_REAL;
|
||||
*pack_c = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
*td_c = BLIS_COMPLEX;
|
||||
|
||||
if ( bli_obj_is_real( *a ) ) *pack_c = TRUE;
|
||||
|
||||
if ( bli_obj_is_complex( *a ) )
|
||||
{
|
||||
if ( !bli_obj_is_col_stored( *c ) ) *pack_c = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_herk_get_target_prec( obj_t* a,
|
||||
obj_t* c,
|
||||
prec_t* tp_a,
|
||||
prec_t* tp_c,
|
||||
bool_t* pack_c )
|
||||
{
|
||||
prec_t p_a = bli_obj_precision( *a );
|
||||
prec_t p_c = bli_obj_precision( *c );
|
||||
|
||||
if ( bli_is_single_prec( p_c ) )
|
||||
{
|
||||
if ( bli_is_double_prec( p_a ) )
|
||||
{
|
||||
*tp_c = *tp_a = BLIS_DOUBLE_PREC;
|
||||
*pack_c = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
*tp_c = *tp_a = BLIS_SINGLE_PREC;
|
||||
}
|
||||
}
|
||||
else // if ( bli_is_double_prec( p_c ) )
|
||||
{
|
||||
if ( bli_is_single_prec( p_a ) )
|
||||
{
|
||||
*tp_c = *tp_a = BLIS_SINGLE_PREC;
|
||||
*pack_c = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
*tp_c = *tp_a = BLIS_DOUBLE_PREC;
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
66
frame/3/herk/bli_herk_target.h
Normal file
66
frame/3/herk/bli_herk_target.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Sets the target datatype of each of a, ah, and c to that object's own
// datatype, and sets the execution datatype of all three to the datatype
// of a.
// Outputs: *dt_alpha receives the target datatype of a, *dt_beta receives
// the datatype of c, and *pack_c is set to FALSE (packing of C is
// currently disabled).
void bli_herk_set_targ_exec_datatypes( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
num_t* dt_alpha,
|
||||
num_t* dt_beta,
|
||||
bool_t* pack_c );
|
||||
|
||||
/*
|
||||
void bli_herk_get_target_datatypes( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
num_t* dt_a,
|
||||
num_t* dt_b,
|
||||
num_t* dt_c,
|
||||
bool_t* pack_c );
|
||||
|
||||
void bli_herk_get_target_domain( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
dom_t* td_a,
|
||||
dom_t* td_b,
|
||||
dom_t* td_c,
|
||||
bool_t* pack_c );
|
||||
|
||||
void bli_herk_get_target_prec( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
prec_t* tp_a,
|
||||
prec_t* tp_b,
|
||||
prec_t* tp_c,
|
||||
bool_t* pack_c );
|
||||
*/
|
||||
@@ -52,11 +52,9 @@ void bli_symm( side_t side,
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_b;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_alpha;
|
||||
num_t dt_beta;
|
||||
bool_t pack_c;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
@@ -69,45 +67,51 @@ void bli_symm( side_t side,
|
||||
return;
|
||||
}
|
||||
|
||||
// For now, assume the storage datatypes are the desired target
|
||||
// datatypes.
|
||||
dt_targ_a = bli_obj_datatype( *a );
|
||||
dt_targ_b = bli_obj_datatype( *b );
|
||||
dt_targ_c = bli_obj_datatype( *c );
|
||||
|
||||
// Alias A and B in case we need to induce the right side case.
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
bli_obj_alias_to( *b, b_local );
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
|
||||
// We implement symm in terms of gemm. But in order to do so we must make
|
||||
// sure matrix A is on the correct side for our gemm kernel. We assume
|
||||
// gemm is implemented with a block-panel kernel, thus, we will only
|
||||
// directly support the BLIS_LEFT case. We handle the BLIS_RIGHT case by
|
||||
// transposing the operation. Since A is symmetric, we do not mark it
|
||||
// for any conjugation or transposition.
|
||||
if ( bli_is_right( side ) )
|
||||
// An optimization: If C is row-stored, transpose the entire operation
|
||||
// so as to allow the macro-kernel more favorable access patterns
|
||||
// through C.
|
||||
if ( bli_obj_is_row_stored( *c ) )
|
||||
{
|
||||
bli_obj_toggle_trans( b_local );
|
||||
bli_obj_toggle_trans( c_local );
|
||||
bli_toggle_side( side );
|
||||
bli_obj_induce_trans( b_local );
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Create an object to hold a copy-cast of alpha. Notice that we use
|
||||
// the target datatype of matrix A.
|
||||
dt_alpha = dt_targ_a;
|
||||
// Swap A and B if multiplying A from the right so that "B" contains
|
||||
// the symmetric matrix.
|
||||
if ( bli_is_right( side ) )
|
||||
{
|
||||
bli_obj_swap( a_local, b_local );
|
||||
}
|
||||
|
||||
// Set the target and execution datatypes of the objects, and apply
|
||||
// any transformations necessary to handle mixed domain computation.
|
||||
bli_gemm_set_targ_exec_datatypes( &a_local,
|
||||
&b_local,
|
||||
&c_local,
|
||||
&dt_alpha,
|
||||
&dt_beta,
|
||||
&pack_c );
|
||||
|
||||
// Create an object to hold a copy-cast of alpha.
|
||||
bli_obj_init_scalar_copy_of( dt_alpha,
|
||||
BLIS_NO_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
|
||||
// Create an object to hold a copy-cast of beta. Notice that we use
|
||||
// the datatype of C.
|
||||
dt_beta = bli_obj_datatype( *c );
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_beta,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
// Choose the control tree. We can just use hemm since the algorithm
|
||||
// is nearly identical to that of symm.
|
||||
cntl = hemm_cntl;
|
||||
|
||||
@@ -50,15 +50,13 @@ void bli_syr2k( obj_t* alpha,
|
||||
obj_t alpha_local;
|
||||
obj_t beta_local;
|
||||
obj_t c_local;
|
||||
obj_t at;
|
||||
obj_t bt;
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_b;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_exec;
|
||||
obj_t a_local;
|
||||
obj_t bt_local;
|
||||
obj_t b_local;
|
||||
obj_t at_local;
|
||||
num_t dt_alpha;
|
||||
num_t dt_beta;
|
||||
//bool_t pack_c = FALSE;
|
||||
bool_t pack_c;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
@@ -71,69 +69,52 @@ void bli_syr2k( obj_t* alpha,
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias C so we can reset it as the root object (in case it is not
|
||||
// already a root object).
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
bli_obj_alias_to( *b, b_local );
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
bli_obj_set_as_root( c_local );
|
||||
|
||||
// Create objects to track A^T and B^T (for the second rank-k update).
|
||||
bli_obj_alias_with_trans( BLIS_TRANSPOSE, *a, at );
|
||||
bli_obj_alias_with_trans( BLIS_TRANSPOSE, *b, bt );
|
||||
// For syr2k, the first and second right-hand "B" operands are simply B'
|
||||
// and A'.
|
||||
bli_obj_alias_to( *b, bt_local );
|
||||
bli_obj_induce_trans( bt_local );
|
||||
bli_obj_alias_to( *a, at_local );
|
||||
bli_obj_induce_trans( at_local );
|
||||
|
||||
// Determine the target datatype of each matrix object.
|
||||
//bli_syr2k_get_target_datatypes( a,
|
||||
// b,
|
||||
// c,
|
||||
// &dt_targ_a,
|
||||
// &dt_targ_b,
|
||||
// &dt_targ_c,
|
||||
// &pack_c );
|
||||
// An optimization: If C is row-stored, transpose the entire operation
|
||||
// so as to allow the macro-kernel more favorable access patterns
|
||||
// through C. (The effect of the transposition of A and A' is negligible
|
||||
// because those operands are always packed to contiguous memory.)
|
||||
if ( bli_obj_is_row_stored( c_local ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
dt_targ_a = bli_obj_datatype( *a );
|
||||
dt_targ_b = bli_obj_datatype( *b );
|
||||
dt_targ_c = bli_obj_datatype( *c );
|
||||
// Set the target and execution datatypes of the objects, and apply
|
||||
// any transformations necessary to handle mixed domain computation.
|
||||
bli_her2k_set_targ_exec_datatypes( &a_local,
|
||||
&bt_local,
|
||||
&b_local,
|
||||
&at_local,
|
||||
&c_local,
|
||||
&dt_alpha,
|
||||
&dt_beta,
|
||||
&pack_c );
|
||||
|
||||
// Set the target datatypes for each matrix object.
|
||||
bli_obj_set_target_datatype( dt_targ_a, *a );
|
||||
bli_obj_set_target_datatype( dt_targ_b, *b );
|
||||
bli_obj_set_target_datatype( dt_targ_c, *c );
|
||||
|
||||
// Determine the execution datatype.
|
||||
dt_exec = dt_targ_a;
|
||||
|
||||
// Embed the execution datatype in all matrix operands.
|
||||
bli_obj_set_execution_datatype( dt_exec, *a );
|
||||
bli_obj_set_execution_datatype( dt_exec, *b );
|
||||
bli_obj_set_execution_datatype( dt_exec, *c );
|
||||
|
||||
|
||||
// Create an object to hold a copy-cast of alpha. Notice that we use
|
||||
// the target datatype of matrix a. By inspecting the table above,
|
||||
// this clearly works for cases (0) through (4), (6), and (7). It
|
||||
// Also works for case (5) since it is transformed into case (6) by
|
||||
// the above code.
|
||||
dt_alpha = dt_targ_a;
|
||||
// Create an object to hold a copy-cast of alpha.
|
||||
bli_obj_init_scalar_copy_of( dt_alpha,
|
||||
BLIS_NO_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
|
||||
// Create an object to hold a copy-cast of beta. Notice that we use
|
||||
// the datatype of c. Here's why: If c is real and beta is complex,
|
||||
// there is no reason to keep beta_local in the complex domain since
|
||||
// the complex part of beta*c will not be stored. If c is complex and
|
||||
// beta is real then beta is harmlessly promoted to complex.
|
||||
dt_beta = bli_obj_datatype( *c );
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_beta,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
// Choose the control tree based on whether it was determined we need
|
||||
// to pack c.
|
||||
//if ( pack_c ) syr2k_cntl = her2k_cntl_packabc;
|
||||
//else syr2k_cntl = her2k_cntl_packab;
|
||||
//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
// Choose the control tree. We can just use her2k since the algorithm
|
||||
// is nearly identical to that of syr2k.
|
||||
@@ -141,11 +122,11 @@ void bli_syr2k( obj_t* alpha,
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_her2k_int( &alpha_local,
|
||||
a,
|
||||
&bt,
|
||||
&a_local,
|
||||
&bt_local,
|
||||
&alpha_local,
|
||||
b,
|
||||
&at,
|
||||
&b_local,
|
||||
&at_local,
|
||||
&beta_local,
|
||||
&c_local,
|
||||
cntl );
|
||||
|
||||
@@ -47,15 +47,12 @@ void bli_syrk( obj_t* alpha,
|
||||
herk_t* cntl;
|
||||
obj_t alpha_local;
|
||||
obj_t beta_local;
|
||||
obj_t a_local;
|
||||
obj_t at_local;
|
||||
obj_t c_local;
|
||||
obj_t at;
|
||||
num_t dt_targ_a;
|
||||
num_t dt_targ_at;
|
||||
num_t dt_targ_c;
|
||||
num_t dt_exec;
|
||||
num_t dt_alpha;
|
||||
num_t dt_beta;
|
||||
//bool_t pack_c = FALSE;
|
||||
bool_t pack_c;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
@@ -68,78 +65,46 @@ void bli_syrk( obj_t* alpha,
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias C so we can reset it as the root object (in case it is not
|
||||
// already a root object).
|
||||
// Alias A and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
bli_obj_set_as_root( c_local );
|
||||
|
||||
// For syrk, the right-hand "B" operand is simply A^T.
|
||||
bli_obj_alias_with_trans( BLIS_TRANSPOSE, *a, at );
|
||||
// For syrk, the right-hand "B" operand is simply A^T.
|
||||
bli_obj_alias_to( *a, at_local );
|
||||
bli_obj_induce_trans( at_local );
|
||||
|
||||
// Determine the target datatype of each matrix object.
|
||||
//bli_syrk_get_target_datatypes( a,
|
||||
// c,
|
||||
// &dt_targ_a,
|
||||
// &dt_targ_c,
|
||||
// &pack_c );
|
||||
|
||||
dt_targ_a = bli_obj_datatype( *a );
|
||||
dt_targ_at = bli_obj_datatype( *a );
|
||||
dt_targ_c = bli_obj_datatype( *c );
|
||||
// An optimization: If C is row-stored, transpose the entire operation
|
||||
// so as to allow the macro-kernel more favorable access patterns
|
||||
// through C. (The effect of the transposition of A and A^T is negligible
|
||||
// because those operands are always packed to contiguous memory.)
|
||||
if ( bli_obj_is_row_stored( c_local ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Set the target datatypes for each matrix object.
|
||||
bli_obj_set_target_datatype( dt_targ_a, *a );
|
||||
bli_obj_set_target_datatype( dt_targ_at, at );
|
||||
bli_obj_set_target_datatype( dt_targ_c, *c );
|
||||
// Set the target and execution datatypes of the objects, and apply
|
||||
// any transformations necessary to handle mixed domain computation.
|
||||
bli_herk_set_targ_exec_datatypes( &a_local,
|
||||
&at_local,
|
||||
&c_local,
|
||||
&dt_alpha,
|
||||
&dt_beta,
|
||||
&pack_c );
|
||||
|
||||
// Determine the execution datatype. For syrk, the execution
|
||||
// datatype is always the target datatype of a.
|
||||
dt_exec = dt_targ_a;
|
||||
|
||||
// Embed the execution datatype in all matrix operands.
|
||||
bli_obj_set_execution_datatype( dt_exec, *a );
|
||||
bli_obj_set_execution_datatype( dt_exec, at );
|
||||
bli_obj_set_execution_datatype( dt_exec, *c );
|
||||
|
||||
// Note that the precisions of the target datatypes of a and c
|
||||
// match. The domains, however, are not necessarily the same. There
|
||||
// are four possible combinations of target domains:
|
||||
//
|
||||
// case input target exec pack notes
|
||||
// domain domain domain c?
|
||||
// c+=a*a' c+=a*a'
|
||||
// (0) r r r r r r r
|
||||
// (1) r c c c c c c yes a*a^T demoted to real
|
||||
// (2) c r r r r r r yes copynzm used to update c
|
||||
// (3) c c c c c c c
|
||||
|
||||
// Create an object to hold a copy-cast of alpha. Notice that we use
|
||||
// the target datatype of matrix a. By inspecting the table above,
|
||||
// this clearly works for cases (0) through (4), (6), and (7). It
|
||||
// Also works for case (5) since it is transformed into case (6) by
|
||||
// the above code.
|
||||
dt_alpha = dt_targ_a;
|
||||
// Create an object to hold a copy-cast of alpha.
|
||||
bli_obj_init_scalar_copy_of( dt_alpha,
|
||||
BLIS_NO_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
|
||||
// Create an object to hold a copy-cast of beta. Notice that we use
|
||||
// the datatype of c. Here's why: If c is real and beta is complex,
|
||||
// there is no reason to keep beta_local in the complex domain since
|
||||
// the complex part of beta*c will not be stored. If c is complex and
|
||||
// beta is real then beta is harmlessly promoted to complex.
|
||||
dt_beta = bli_obj_datatype( *c );
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_beta,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
// Choose the control tree based on whether it was determined we need
|
||||
// to pack c.
|
||||
//if ( pack_c ) syrk_cntl = herk_cntl_packabc;
|
||||
//else syrk_cntl = herk_cntl_packab;
|
||||
//if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
if ( pack_c ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
// Choose the control tree. We can just use herk since the algorithm
|
||||
// is nearly identical to that of syrk.
|
||||
@@ -147,8 +112,8 @@ void bli_syrk( obj_t* alpha,
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_herk_int( &alpha_local,
|
||||
a,
|
||||
&at,
|
||||
&a_local,
|
||||
&at_local,
|
||||
&beta_local,
|
||||
&c_local,
|
||||
cntl );
|
||||
|
||||
@@ -894,6 +894,16 @@ bli_obj_width_stored( obj )
|
||||
: ( bli_obj_buffer_at_off( obj ) ) \
|
||||
)
|
||||
|
||||
|
||||
// Swap objects
//
// Exchanges the contents of two obj_t objects by value, using a local
// obj_t temporary named t.
// - Both arguments are assigned to, so each must be a modifiable obj_t
//   lvalue (not a pointer to one).
// - Each argument appears twice in the expansion; avoid arguments with
//   side effects.
// - Neither argument may itself be named t, since that would refer to the
//   macro's own temporary.
// - The expansion is a plain brace block, not do { } while ( 0 ), so
//   "if ( x ) bli_obj_swap( a, b ); else ..." without braces around the
//   call will not compile.
|
||||
|
||||
#define bli_obj_swap( a, b ) \
|
||||
{ \
|
||||
obj_t t; \
|
||||
t = b; b = a; a = t; \
|
||||
}
|
||||
|
||||
|
||||
// Swap object pointers
|
||||
|
||||
#define bli_obj_swap_pointers( a, b ) \
|
||||
@@ -902,6 +912,7 @@ bli_obj_width_stored( obj )
|
||||
t = b; b = a; a = t; \
|
||||
}
|
||||
|
||||
|
||||
// If a transposition is needed, induce one: swap dimensions, increments
|
||||
// and offsets, and then clear the trans bit.
|
||||
|
||||
|
||||
@@ -127,13 +127,13 @@
|
||||
|
||||
// min, max, abs
|
||||
|
||||
#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
|
||||
#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) )
|
||||
#define bli_abs( a ) ( (a) < 0 ? -(a) : (a) )
|
||||
#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
|
||||
#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) )
|
||||
#define bli_abs( a ) ( (a) < 0 ? -(a) : (a) )
|
||||
|
||||
// fmin, fmax, fabs
|
||||
|
||||
#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
|
||||
#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
|
||||
#define bli_fmin( a, b ) bli_min( a, b )
|
||||
#define bli_fmax( a, b ) bli_max( a, b )
|
||||
#define bli_fabs( a ) ( (a) < 0.0 ? -(a) : (a) )
|
||||
|
||||
Reference in New Issue
Block a user