mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Consolidated packm_blk_var2 and var3.
Details: - Consolidated the functionality previously supported by packm_blk_var2() and packm_blk_var3() into a new variant, packm_blk_var1(). - Updated packm_gen_cxk(), packm_herm_cxk(), and packm_tri_cxk() to accommodate the above changes. - Removed packm_blk_var3() and retired packm_blk_var2() to frame/1m/packm/old. - Updated all level-3 _cntl_init() functions so that the new, more versatile packm_blk_var1 is used for all level-3 matrix packing.
This commit is contained in:
@@ -41,9 +41,7 @@
|
||||
|
||||
#include "bli_packm_unb_var1.h"
|
||||
|
||||
#include "bli_packm_blk_var2.h"
|
||||
|
||||
#include "bli_packm_blk_var3.h"
|
||||
#include "bli_packm_blk_var1.h"
|
||||
|
||||
#include "bli_packm_cxk.h"
|
||||
#include "bli_packm_gen_cxk.h"
|
||||
|
||||
@@ -55,10 +55,10 @@ typedef void (*FUNCPTR_T)(
|
||||
dim_t pd_p, inc_t ps_p
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3);
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
|
||||
|
||||
|
||||
void bli_packm_blk_var3( obj_t* c,
|
||||
void bli_packm_blk_var1( obj_t* c,
|
||||
obj_t* p )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
@@ -154,33 +154,35 @@ void PASTEMAC(ch,varname )( \
|
||||
dim_t it, ic, ip; \
|
||||
dim_t ic0, ip0; \
|
||||
doff_t ic_inc, ip_inc; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
doff_t diagoffc_i; \
|
||||
doff_t diagoffc_inc; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_off_i; \
|
||||
inc_t vs_c; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp, p_inc; \
|
||||
dim_t* m_panel_full; \
|
||||
dim_t* n_panel_full; \
|
||||
dim_t* m_panel_use; \
|
||||
dim_t* n_panel_use; \
|
||||
dim_t* m_panel_max; \
|
||||
dim_t* n_panel_max; \
|
||||
conj_t conjc; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
dim_t* m_panel_use; \
|
||||
dim_t* n_panel_use; \
|
||||
doff_t diagoffp; \
|
||||
doff_t diagoffp_i; \
|
||||
\
|
||||
\
|
||||
/* If C is zeros, then we don't need to pack it. */ \
|
||||
if ( bli_is_zeros( uploc ) ) return; \
|
||||
/* If C is zeros and part of a triangular matrix, then we don't need
|
||||
to pack it. */ \
|
||||
if ( bli_is_zeros( uploc ) && \
|
||||
bli_is_triangular( strucc ) ) return; \
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
@@ -201,38 +203,38 @@ void PASTEMAC(ch,varname )( \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t)panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
m_panel_use = &panel_len_i; \
|
||||
n_panel_use = &panel_dim_i; \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t)panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
m_panel_use = &panel_len_i; \
|
||||
n_panel_use = &panel_dim_i; \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim_max; \
|
||||
ldp = cs_p; \
|
||||
m_panel_full = &panel_dim_i; \
|
||||
n_panel_full = &n; \
|
||||
m_panel_use = &panel_dim_i; \
|
||||
n_panel_use = &panel_len_i; \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim_max; \
|
||||
ldp = cs_p; \
|
||||
m_panel_full = &panel_dim_i; \
|
||||
n_panel_full = &n; \
|
||||
m_panel_use = &panel_dim_i; \
|
||||
n_panel_use = &panel_len_i; \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
@@ -240,8 +242,8 @@ void PASTEMAC(ch,varname )( \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
if ( ( revifup && bli_is_upper( uploc ) ) || \
|
||||
( reviflo && bli_is_lower( uploc ) ) ) \
|
||||
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
|
||||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
|
||||
{ \
|
||||
ic0 = (num_iter - 1) * panel_dim_max; \
|
||||
ic_inc = -panel_dim_max; \
|
||||
@@ -266,19 +268,25 @@ void PASTEMAC(ch,varname )( \
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
/* If the current panel is unstored, do nothing. (Notice that we use
|
||||
the continue statement, so we don't even increment p_begin.)
|
||||
If the current panel intersects the diagonal (and the matrix is
|
||||
triangular), pack only as much as we need (ie: skip over as much
|
||||
as possible on the unstored side of the diagonal).
|
||||
Otherwise, we assume the current panel is full-length. */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is completely unstored (ie: zero). If the panel
|
||||
is unstored, we do nothing. (Notice that we don't even
|
||||
increment p_begin.) */ \
|
||||
\
|
||||
continue; \
|
||||
} \
|
||||
else if ( bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) && \
|
||||
bli_is_triangular( strucc ) ) \
|
||||
else if ( bli_is_triangular( strucc ) && \
|
||||
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is diagonal-intersecting. Notice that we
|
||||
cannot bury the following conditional logic into
|
||||
packm_tri_cxk() because we need to know the value of
|
||||
panel_len_max_i so we can properly increment p_inc. */ \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
@@ -293,22 +301,22 @@ void PASTEMAC(ch,varname )( \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
|
||||
diagoffp = diagoffc_i; \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len - panel_off_i; \
|
||||
panel_len_max_i = panel_len_max - panel_off_i; \
|
||||
diagoffp = 0; \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
panel_len_max_i = panel_len_max - panel_off_i; \
|
||||
diagoffp_i = 0; \
|
||||
} \
|
||||
\
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
PASTEMAC(ch,packm_tri_cxk)( strucc, \
|
||||
diagoffp, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
@@ -324,9 +332,38 @@ void PASTEMAC(ch,varname )( \
|
||||
\
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a Hermitian or
|
||||
symmetric matrix, which includes stored, unstored, and
|
||||
diagonal-intersecting panels. */ \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
PASTEMAC(ch,packm_herm_cxk)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
/* NOTE: p_inc should be set to ps_p to properly support
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
panel_len_i = panel_len; \
|
||||
/* This case executes if the panel is general, or, if the
|
||||
panel is part of a triangular matrix and is neither unstored
|
||||
(ie: zero) nor diagonal-intersecting. */ \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
|
||||
@@ -337,27 +374,30 @@ void PASTEMAC(ch,varname )( \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
/* NOTE: p_inc should be set to ps_p to properly support
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( rs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var3: ap copied", panel_dim_max, panel_len_max_i, \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
|
||||
p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
if ( cs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var3: bp copied", panel_len_max_i, panel_dim_max, \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
|
||||
p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm, packm_blk_var3 )
|
||||
INSERT_GENTFUNC_BASIC( packm, packm_blk_var1 )
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_packm_blk_var3( obj_t* c,
|
||||
void bli_packm_blk_var1( obj_t* c,
|
||||
obj_t* p );
|
||||
|
||||
|
||||
@@ -58,5 +58,5 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_blk_var3 )
|
||||
INSERT_GENTPROT_BASIC( packm_blk_var1 )
|
||||
|
||||
@@ -38,23 +38,20 @@
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict c_begin = c; \
|
||||
ctype* restrict p_begin = p; \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
\
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
@@ -84,29 +81,14 @@ void PASTEMAC(ch,varname)( \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
stored, also taking conjugation into account. (Note this
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
c_begin = c_begin + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
bli_swap_incs( incc, ldc ); \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc ); \
|
||||
} \
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa_cast, \
|
||||
c_begin, incc, ldc, \
|
||||
p_begin, ldp ); \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp ); \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
@@ -121,7 +103,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p_begin + (i )*rs_p; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
@@ -137,7 +119,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p_begin + (j )*cs_p; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
|
||||
@@ -36,17 +36,17 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_gen_cxk )
|
||||
|
||||
@@ -38,23 +38,20 @@
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict c_begin = c; \
|
||||
ctype* restrict p_begin = p; \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
\
|
||||
dim_t i, j; \
|
||||
dim_t panel_len; \
|
||||
@@ -110,122 +107,151 @@ void PASTEMAC(ch,varname)( \
|
||||
cs_p11 = cs_p; \
|
||||
} \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then the constraint that cache
|
||||
blocksizes be whole multiples of the register blocksizes
|
||||
was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
p10 = p_begin; \
|
||||
c10 = c_begin; \
|
||||
incc10 = incc; \
|
||||
ldc10 = ldc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p_begin + (j )*ldp; \
|
||||
c12 = c_begin + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
p11_m = panel_dim; \
|
||||
p11_n = panel_dim; \
|
||||
j = diagoffc_abs; \
|
||||
p11 = p_begin + (j )*ldp; \
|
||||
c11 = c_begin + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p_begin; \
|
||||
c10 = c_begin; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p_begin + (j )*ldp; \
|
||||
c12 = c_begin + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
p11_m = panel_dim; \
|
||||
p11_n = panel_dim; \
|
||||
j = diagoffc_abs; \
|
||||
p11 = p_begin + (j )*ldp; \
|
||||
c11 = c_begin + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
\
|
||||
/* Pack to P10. For upper storage, this includes the unstored
|
||||
triangle of C11. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc10, \
|
||||
p10_dim, \
|
||||
p10_len, \
|
||||
kappa_cast, \
|
||||
c10, incc10, ldc10, \
|
||||
p10, ldp ); \
|
||||
\
|
||||
/* Pack to P12. For lower storage, this includes the unstored
|
||||
triangle of C11. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc12, \
|
||||
p12_dim, \
|
||||
p12_len, \
|
||||
kappa_cast, \
|
||||
c12, incc12, ldc12, \
|
||||
p12, ldp ); \
|
||||
\
|
||||
/* Pack the stored triangle of C11 to P11. */ \
|
||||
PASTEMAC3(ch,ch,ch,scal2m_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* If source matrix C is Hermitian, we have to zero out the
|
||||
imaginary components of the diagonal of P11 in case the
|
||||
corresponding elements in C11 were not already zero. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
/* NOTE: We can directly increment p11 since we are done
|
||||
using p11 for the remainder of the function. */ \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
stored, also taking conjugation into account. (Note this
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,seti0s)( *p11 ); \
|
||||
c = c + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
bli_swap_incs( incc, ldc ); \
|
||||
\
|
||||
p11 += rs_p11 + cs_p11; \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc ); \
|
||||
} \
|
||||
\
|
||||
/* Pack the full panel. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp ); \
|
||||
} \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
|
||||
{ \
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then the constraint that cache
|
||||
blocksizes be whole multiples of the register blocksizes
|
||||
was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
p10 = p; \
|
||||
c10 = c; \
|
||||
incc10 = incc; \
|
||||
ldc10 = ldc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
p11_m = panel_dim; \
|
||||
p11_n = panel_dim; \
|
||||
j = diagoffc_abs; \
|
||||
p11 = p + (j )*ldp; \
|
||||
c11 = c + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p; \
|
||||
c10 = c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
p11_m = panel_dim; \
|
||||
p11_n = panel_dim; \
|
||||
j = diagoffc_abs; \
|
||||
p11 = p + (j )*ldp; \
|
||||
c11 = c + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
\
|
||||
/* Pack to P10. For upper storage, this includes the unstored
|
||||
triangle of C11. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc10, \
|
||||
p10_dim, \
|
||||
p10_len, \
|
||||
kappa, \
|
||||
c10, incc10, ldc10, \
|
||||
p10, ldp ); \
|
||||
\
|
||||
/* Pack to P12. For lower storage, this includes the unstored
|
||||
triangle of C11. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc12, \
|
||||
p12_dim, \
|
||||
p12_len, \
|
||||
kappa, \
|
||||
c12, incc12, ldc12, \
|
||||
p12, ldp ); \
|
||||
\
|
||||
/* Pack the stored triangle of C11 to P11. */ \
|
||||
PASTEMAC3(ch,ch,ch,scal2m_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa, \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* If source matrix C is Hermitian, we have to zero out the
|
||||
imaginary components of the diagonal of P11 in case the
|
||||
corresponding elements in C11 were not already zero. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
/* NOTE: We can directly increment p11 since we are done
|
||||
using p11 for the remainder of the function. */ \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,seti0s)( *p11 ); \
|
||||
\
|
||||
p11 += rs_p11 + cs_p11; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
@@ -241,7 +267,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p_begin + (i )*rs_p; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
@@ -257,7 +283,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p_begin + (j )*cs_p; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
|
||||
@@ -36,17 +36,17 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_herm_cxk )
|
||||
|
||||
@@ -42,9 +42,9 @@ typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
static FUNCPTR_T vars[6][3] =
|
||||
{
|
||||
// unblocked optimized unblocked blocked
|
||||
{ bli_packm_unb_var1, NULL, NULL, },
|
||||
{ NULL, NULL, bli_packm_blk_var2 },
|
||||
{ NULL, NULL, bli_packm_blk_var3 },
|
||||
{ bli_packm_unb_var1, NULL, bli_packm_blk_var1 },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
|
||||
@@ -38,25 +38,22 @@
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* kappa, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* p, inc_t rs_p, inc_t cs_p \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict c_begin = c; \
|
||||
ctype* restrict p_begin = p; \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
\
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
@@ -90,9 +87,9 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,packm_cxk)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa_cast, \
|
||||
c_begin, incc, ldc, \
|
||||
p_begin, ldp ); \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp ); \
|
||||
\
|
||||
/* If the diagonal of C is implicitly unit, set the diagonal of
|
||||
the packed panel to unit. */ \
|
||||
@@ -101,8 +98,8 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC2(ch,ch,setd_unb_var1)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa_cast, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
kappa, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. */ \
|
||||
@@ -111,7 +108,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,invertd_unb_var1)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* Set the region opposite the diagonal of P to zero. To do this,
|
||||
@@ -131,7 +128,7 @@ void PASTEMAC(ch,varname)( \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
@@ -146,7 +143,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p_begin + (i )*rs_p; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
@@ -162,7 +159,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p_begin + (j )*cs_p; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
@@ -184,7 +181,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype* one = PASTEMAC(ch,1); \
|
||||
ctype* p_edge = p_begin + (i )*rs_p + (j )*cs_p; \
|
||||
ctype* p_edge = p + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setd_unb_var1)( 0, \
|
||||
m_br, \
|
||||
@@ -199,10 +196,10 @@ void PASTEMAC(ch,varname)( \
|
||||
/*
|
||||
if ( rs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var3: ap copied", m_panel_max, n_panel_max, \
|
||||
p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
if ( cs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var3: bp copied", m_panel_max, n_panel_max, \
|
||||
p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
|
||||
@@ -36,19 +36,19 @@
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* kappa, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* p, inc_t rs_p, inc_t cs_p \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_tri_cxk )
|
||||
|
||||
@@ -218,12 +218,10 @@ void PASTEMAC(ch,varname )( \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
p_begin = p_cast + (ip )*ps_p; \
|
||||
\
|
||||
/* If the current panel intersects the diagonal and C is either
|
||||
upper- or lower-stored, then we assume C is symmetric or
|
||||
Hermitian and that it must be densified.
|
||||
Otherwise, we pack the panel all at once. */ \
|
||||
if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
|
||||
bli_is_upper_or_lower( uploc ) ) \
|
||||
/* Call a specialized packm kernel wrapper for Hermitian and
|
||||
symmetric matrices. Otherwise, call the kernel wrapper for
|
||||
general matrices. */ \
|
||||
if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk)( strucc, \
|
||||
diagoffc_i, \
|
||||
@@ -237,7 +235,7 @@ void PASTEMAC(ch,varname )( \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
else \
|
||||
else /* if ( bli_is_general( strucc ) ) */ \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk)( strucc, \
|
||||
diagoffc_i, \
|
||||
@@ -99,7 +99,7 @@ void bli_gemm_cntl_init()
|
||||
gemm_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm_mr,
|
||||
gemm_kr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
@@ -112,7 +112,7 @@ void bli_gemm_cntl_init()
|
||||
gemm_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm_kr,
|
||||
gemm_nr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
|
||||
@@ -62,7 +62,7 @@ void bli_herk_cntl_init()
|
||||
herk_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm_mr,
|
||||
gemm_kr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
@@ -75,7 +75,7 @@ void bli_herk_cntl_init()
|
||||
herk_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm_kr,
|
||||
gemm_nr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
|
||||
@@ -73,7 +73,7 @@ void bli_trmm_cntl_init()
|
||||
trmm_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3, // pack panels of A compactly
|
||||
BLIS_VARIANT1, // pack panels of A compactly
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
gemm_mr,
|
||||
@@ -88,7 +88,7 @@ void bli_trmm_cntl_init()
|
||||
trmm_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
gemm_mr,
|
||||
@@ -104,7 +104,7 @@ void bli_trmm_cntl_init()
|
||||
trmm_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to nr.
|
||||
gemm_mr,
|
||||
@@ -119,7 +119,7 @@ void bli_trmm_cntl_init()
|
||||
trmm_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3, // pack panels of B compactly
|
||||
BLIS_VARIANT1, // pack panels of B compactly
|
||||
// IMPORTANT: m dim multiple here must be nr
|
||||
// since "k" dim multiple is set to nr above.
|
||||
gemm_nr,
|
||||
|
||||
@@ -88,7 +88,7 @@ void bli_trsm_cntl_init()
|
||||
trsm_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3, // pack panels of A compactly
|
||||
BLIS_VARIANT1, // pack panels of A compactly
|
||||
// IMPORTANT: n dim multiple must be mr to
|
||||
// support right and bottom-right edge cases
|
||||
gemm_mr,
|
||||
@@ -103,7 +103,7 @@ void bli_trsm_cntl_init()
|
||||
trsm_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: m dim multiple must be mr since
|
||||
// B_pack is updated (ie: serves as C) in trsm
|
||||
gemm_mr,
|
||||
@@ -119,7 +119,7 @@ void bli_trsm_cntl_init()
|
||||
trsm_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm_nr,
|
||||
gemm_mr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
@@ -132,7 +132,7 @@ void bli_trsm_cntl_init()
|
||||
trsm_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3, // pack panels of B compactly
|
||||
BLIS_VARIANT1, // pack panels of B compactly
|
||||
gemm_mr,
|
||||
gemm_mr,
|
||||
TRUE, // densify
|
||||
|
||||
@@ -190,6 +190,12 @@
|
||||
\
|
||||
( struc == BLIS_TRIANGULAR )
|
||||
|
||||
#define bli_is_herm_or_symm( struc ) \
|
||||
\
|
||||
( bli_is_hermitian( struc ) || \
|
||||
bli_is_symmetric( struc ) )
|
||||
|
||||
|
||||
|
||||
// conj
|
||||
|
||||
|
||||
@@ -251,10 +251,10 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
|
||||
&b, &bp );
|
||||
|
||||
// Pack the contents of a to ap.
|
||||
bli_packm_blk_var3( &a, &ap );
|
||||
bli_packm_blk_var1( &a, &ap );
|
||||
|
||||
// Pack the contents of b to bp.
|
||||
bli_packm_blk_var2( &b, &bp );
|
||||
bli_packm_blk_var1( &b, &bp );
|
||||
|
||||
|
||||
// Create subpartitions from the a and b panels.
|
||||
|
||||
@@ -217,7 +217,7 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
|
||||
&b, &bp );
|
||||
|
||||
// Pack the contents of a to ap.
|
||||
bli_packm_blk_var3( &a, &ap );
|
||||
bli_packm_blk_var1( &a, &ap );
|
||||
|
||||
|
||||
// Repeat the experiment n_repeats times and record results.
|
||||
|
||||
Reference in New Issue
Block a user