Refactored packm variants.

Details:
- Revised packm_blk_var2() and _var3() by encapsulating the general,
  hermitian/symmetric, and triangular panel-packing subproblems into
  separate functions: packm_gen_cxk(), packm_herm_cxk(), and
  packm_tri_cxk(), respectively. Also, homogenized the packm code as
  well as the new specialized packm_*_cxk() code to further improve
  readability.
This commit is contained in:
Field G. Van Zee
2014-02-09 10:07:37 -06:00
parent 6c80670287
commit 32d8f264ae
10 changed files with 890 additions and 427 deletions

View File

@@ -46,4 +46,7 @@
#include "bli_packm_blk_var3.h"
#include "bli_packm_cxk.h"
#include "bli_packm_gen_cxk.h"
#include "bli_packm_herm_cxk.h"
#include "bli_packm_tri_cxk.h"

View File

@@ -131,53 +131,29 @@ void PASTEMAC(ch,varname )( \
dim_t pd_p, inc_t ps_p \
) \
{ \
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict zero = PASTEMAC(ch,0); \
\
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t num_iter; \
dim_t it, ic, ip; \
dim_t i, j; \
\
dim_t ic0, ip0; \
dim_t ic_inc, ip_inc; \
dim_t panel_dim; \
dim_t panel_len; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
doff_t diagoffc_i_abs; \
\
dim_t panel_dim_i; \
inc_t vs_c; \
inc_t incc, ldc; \
inc_t ldp; \
dim_t* m_panel; \
dim_t* n_panel; \
dim_t m_panel_max; \
dim_t n_panel_max; \
conj_t conjc; \
\
ctype* restrict c10; \
ctype* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
ctype* restrict c12; \
ctype* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
ctype* restrict c11; \
ctype* restrict p11; \
dim_t p11_m; \
dim_t p11_n; \
inc_t rs_p11, cs_p11; \
\
\
/* Extract the conjugation bit from the transposition argument. */ \
@@ -200,37 +176,25 @@ void PASTEMAC(ch,varname )( \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len = m; \
panel_dim = pd_p; \
incc = cs_c; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t)panel_dim; \
ldp = rs_p; \
m_panel = &m; \
n_panel = &panel_dim_i; \
m_panel_max = m_max; \
n_panel_max = panel_dim; \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len = n; \
panel_dim = pd_p; \
incc = rs_c; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim; \
ldp = cs_p; \
m_panel = &panel_dim_i; \
n_panel = &n; \
m_panel_max = panel_dim; \
n_panel_max = n_max; \
rs_p11 = 1; \
cs_p11 = cs_p; \
} \
\
/* Compute the total number of iterations we'll need. */ \
@@ -256,216 +220,36 @@ void PASTEMAC(ch,varname )( \
\
/* If the current panel intersects the diagonal and C is either
upper- or lower-stored, then we assume C is symmetric or
Hermitian and that it must be densified (note we don't even
bother passing in a densify parameter), in which case we pack
the panel in three stages.
Hermitian and that it must be densified.
Otherwise, we pack the panel all at once. */ \
if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
bli_is_upper_or_lower( uploc ) ) \
{ \
diagoffc_i_abs = bli_abs( diagoffc_i ); \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
p10_dim = panel_dim_i; \
p10_len = diagoffc_i_abs; \
p10 = p_begin; \
c10 = c_begin; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim_i; \
p12_len = panel_len - p10_len; \
j = p10_len; \
diagoffc12 = diagoffc_i_abs - j; \
p12 = p_begin + (j )*ldp; \
c12 = c_begin + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
p11_m = panel_dim_i; \
p11_n = panel_dim_i; \
j = diagoffc_i_abs; \
p11 = p_begin + (j )*ldp; \
c11 = c_begin + (j )*ldc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
p10_dim = panel_dim_i; \
p10_len = diagoffc_i_abs + panel_dim_i; \
diagoffc10 = diagoffc_i; \
p10 = p_begin; \
c10 = c_begin; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim_i; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p_begin + (j )*ldp; \
c12 = c_begin + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
p11_m = panel_dim_i; \
p11_n = panel_dim_i; \
j = diagoffc_i_abs; \
p11 = p_begin + (j )*ldp; \
c11 = c_begin + (j )*ldc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc10 ); \
} \
\
/* Pack to P10. For upper storage, this includes the unstored
triangle of C11. */ \
PASTEMAC(ch,packm_cxk)( conjc10, \
p10_dim, \
p10_len, \
kappa_cast, \
c10, incc10, ldc10, \
p10, ldp ); \
\
/* Pack to P12. For lower storage, this includes the unstored
triangle of C11. */ \
PASTEMAC(ch,packm_cxk)( conjc12, \
p12_dim, \
p12_len, \
kappa_cast, \
c12, incc12, ldc12, \
p12, ldp ); \
\
/* Pack the stored triangule of C11 to P11. */ \
PASTEMAC3(ch,ch,ch,scal2m_unb_var1)( 0, \
BLIS_NONUNIT_DIAG, \
uploc, \
conjc, \
p11_m, \
p11_n, \
kappa_cast, \
c11, rs_c, cs_c, \
p11, rs_p11, cs_p11 ); \
\
/* If source matrix C is Hermitian, we have to zero out the
imaginary components of the diagonal of P11 in case the
corresponding elements in C11 were not already zero. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
/* NOTE: We can directly increment p11 since we are done
using p11 for the remainder of the function. */ \
for ( i = 0; i < p11_m; ++i ) \
{ \
PASTEMAC(ch,seti0s)( *p11 ); \
\
p11 += rs_p11 + cs_p11; \
} \
} \
PASTEMAC(ch,packm_herm_cxk)( strucc, \
diagoffc_i, \
uploc, \
conjc, \
*m_panel, \
*n_panel, \
m_panel_max, \
n_panel_max, \
kappa, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
else \
{ \
/* Note that the following code executes if the current panel either:
- does not intersect the diagonal, or
- does intersect the diagonal, BUT the matrix is general
which means the entire current panel can be copied at once. */ \
\
/* We use some c10-specific variables here because we might need
to change them if the current panel is unstored. (The values
below are used if the current panel is stored.) */ \
c10 = c_begin; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, and so we take conjugation into account. (Note
this implicitly assumes we are operating on a symmetric or
Hermitian matrix, since a general matrix would not contain
any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel, *n_panel ) ) \
{ \
c10 = c10 + diagoffc_i * ( doff_t )cs_c + \
-diagoffc_i * ( doff_t )rs_c; \
bli_swap_incs( incc10, ldc10 ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc10 ); \
} \
\
/* Pack the current panel. */ \
PASTEMAC(ch,packm_cxk)( conjc10, \
panel_dim_i, \
panel_len, \
kappa_cast, \
c10, incc10, ldc10, \
p_begin, ldp ); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: c", panel_len, panel_dim_i, \
c_begin, ldc, incc, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: p copied", panel_len, panel_dim_i, \
p_begin, ldp, 1, "%5.2f", "" ); \
*/ \
} \
\
/* The packed memory region was acquired/allocated with "aligned"
dimensions (ie: dimensions that were possibly inflated up to a
multiple). When these dimension are inflated, it creates empty
regions along the bottom and/or right edges of the matrix. If
either region exists, we set them to zero. This simplifies the
register level micro-kernel in that it does not need to support
different register blockings for the edge cases. */ \
if ( *m_panel != m_panel_max ) \
{ \
dim_t i = *m_panel; \
dim_t m_edge = m_panel_max - i; \
dim_t n_edge = n_panel_max; \
ctype* p_edge = p_begin + (i )*rs_p; \
\
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
} \
\
if ( *n_panel != n_panel_max ) \
{ \
dim_t j = *n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - j; \
ctype* p_edge = p_begin + (j )*cs_p; \
\
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
PASTEMAC(ch,packm_gen_cxk)( strucc, \
diagoffc_i, \
uploc, \
conjc, \
*m_panel, \
*n_panel, \
m_panel_max, \
n_panel_max, \
kappa, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/*
@@ -474,7 +258,7 @@ void PASTEMAC(ch,varname )( \
p_begin, 1, cs_p, "%4.1f", "" ); \
if ( cs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: b copied", m_panel_max, n_panel_max, \
p_begin, panel_dim, 1, "%8.5f", "" ); \
p_begin, rs_p, 1, "%4.1f", "" ); \
*/ \
} \
}

View File

@@ -146,7 +146,6 @@ void PASTEMAC(ch,varname )( \
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
@@ -155,7 +154,7 @@ void PASTEMAC(ch,varname )( \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
dim_t panel_dim; \
dim_t panel_dim_max; \
dim_t panel_len; \
dim_t panel_len_max; \
doff_t diagoffc_i; \
@@ -165,10 +164,12 @@ void PASTEMAC(ch,varname )( \
dim_t panel_len_max_i; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t incc, ldc; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel; \
dim_t* n_panel; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
\
ctype* restrict c_use; \
@@ -203,16 +204,17 @@ void PASTEMAC(ch,varname )( \
iter_dim = n; \
panel_len = m; \
panel_len_max = m_max; \
panel_dim = pd_p; \
incc = cs_c; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t)panel_dim; \
diagoffc_inc = -( doff_t)panel_dim_max; \
ldp = rs_p; \
m_panel = &m; \
n_panel = &panel_dim_i; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
{ \
@@ -220,35 +222,36 @@ void PASTEMAC(ch,varname )( \
iter_dim = m; \
panel_len = n; \
panel_len_max = n_max; \
panel_dim = pd_p; \
incc = rs_c; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel = &panel_dim_i; \
n_panel = &n; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the total number of iterations we'll need. */ \
num_iter = iter_dim / panel_dim + ( iter_dim % panel_dim ? 1 : 0 ); \
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) ) || \
( reviflo && bli_is_lower( uploc ) ) ) \
{ \
ic0 = (num_iter - 1) * panel_dim; \
ic_inc = -panel_dim; \
ic0 = (num_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = num_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
@@ -258,7 +261,7 @@ void PASTEMAC(ch,varname )( \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
@@ -269,11 +272,11 @@ void PASTEMAC(ch,varname )( \
triangular), pack only as much as we need (ie: skip over as much
as possible on the unstored side of the diagonal).
Otherwise, we assume the current panel is full-length. */ \
if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel, *n_panel ) ) \
if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
continue; \
} \
else if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
else if ( bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) && \
bli_is_triangular( strucc ) ) \
{ \
/* Sanity check. Diagonals should not intersect the short end of
@@ -284,199 +287,73 @@ void PASTEMAC(ch,varname )( \
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_min( panel_len, -diagoffc_i + panel_dim_i ); \
panel_len_max_i = bli_min( panel_len_max, -diagoffc_i + panel_dim ); \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
diagoffp = diagoffc_i; \
} \
else if ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) \
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( bli_min( diagoffc_i, 0 ) ); \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp = diagoffc_i + panel_off_i; \
} \
else if ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) \
{ \
panel_off_i = bli_max( diagoffc_i, 0 ); \
panel_len_i = panel_len - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp = diagoffc_i - panel_off_i; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) */ \
{ \
panel_off_i = 0; \
panel_len_i = bli_min( panel_len, diagoffc_i + panel_dim_i ); \
panel_len_max_i = bli_min( panel_len_max, diagoffc_i + panel_dim ); \
diagoffp = diagoffc_i; \
diagoffp = 0; \
} \
\
/* Adjust the pointer to the beginning of the panel in C based on
the offset determined above. */ \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* Pack the panel. */ \
PASTEMAC(ch,packm_cxk)( conjc, \
panel_dim_i, \
panel_len_i, \
kappa_cast, \
c_use, incc, ldc, \
p_use, ldp ); \
PASTEMAC(ch,packm_tri_cxk)( strucc, \
diagoffp, \
diagc, \
uploc, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
\
/* If the diagonal of C is implicitly unit, set the diagonal of
the packed panel to unit. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
PASTEMAC2(ch,ch,setd_unb_var1)( diagoffp, \
*m_panel_use, \
*n_panel_use, \
kappa_cast, \
p_use, rs_p, cs_p ); \
} \
\
/* If requested, invert the diagonal of the packed panel. */ \
if ( invdiag == TRUE ) \
{ \
PASTEMAC(ch,invertd_unb_var1)( diagoffp, \
*m_panel_use, \
*n_panel_use, \
p_use, rs_p, cs_p ); \
} \
\
/* Always densify the unstored part of the packed panel. */ \
{ \
uplo_t uplop = uploc; \
\
/* For triangular matrices, we wish to reference the region
strictly opposite the diagonal of C. This amounts to
toggling uploc and then shifting the diagonal offset to
shrink the stored region (by one diagonal). */ \
bli_toggle_uplo( uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
\
/* Set the region opposite the diagonal of P to zero. */ \
PASTEMAC2(ch,ch,setm_unb_var1)( diagoffp, \
BLIS_NONUNIT_DIAG, \
uplop, \
*m_panel_use, \
*n_panel_use, \
zero, \
p_use, rs_p, cs_p ); \
} \
\
p_inc = ldp * panel_len_max_i; \
} \
else \
{ \
c_use = c_begin; \
p_use = p_begin; \
\
/* Pack a full-length panel. */ \
panel_off_i = 0; \
panel_len_i = panel_len; \
panel_len_max_i = panel_len_max; \
\
/* Pack the panel. */ \
PASTEMAC(ch,packm_cxk)( conjc, \
panel_dim_i, \
panel_len_i, \
kappa_cast, \
c_use, incc, ldc, \
p_use, ldp ); \
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
conjc, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
\
p_inc = ldp * panel_len_max_i; \
} \
\
/* If necessary, zero-pad at the edge of the panel dimension (ie: along
the long dimension of the panel). */ \
if ( panel_dim_i != panel_dim ) \
{ \
/* Note that this code does the right thing for both row and
column panels, since an m x n column-stored row panel and an
n x m row-stored column panel look the same in memory. */ \
dim_t i = panel_dim_i; \
dim_t m_edge = panel_dim - i; \
dim_t n_edge = panel_len_max_i; \
inc_t rs_pe = 1; \
inc_t cs_pe = ldp; \
ctype* p_edge = p_begin + (i )*rs_pe; \
\
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_pe, cs_pe ); \
} \
\
/* If necessary, zero-pad at the far end of the panel (ie: at the
other side of the long dimension of the panel). */ \
if ( panel_len_i != panel_len_max_i ) \
{ \
/* Note that this code does the right thing for both row and
column panels, since an m x n column-stored row panel and an
n x m row-stored column panel look the same in memory. */ \
/* Note that we set m_edge as panel_dim, and not panel_dim_i;
this is so that we can simultaneously zero out the corner
region (if it exists). */ \
dim_t j = panel_len_i; \
dim_t m_edge = panel_dim; \
dim_t n_edge = panel_len_max_i - j; \
inc_t rs_pe = 1; \
inc_t cs_pe = ldp; \
ctype* p_edge = p_begin + (j )*cs_pe; \
\
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_pe, cs_pe ); \
} \
\
/* If this panel is an edge case in both panel dimension and length,
then it must be a bottom-right corner case. Set the part of the
diagonal that extends into the zero-padded region to identity. */ \
if ( panel_dim_i != panel_dim && \
panel_len_i != panel_len_max_i ) \
{ \
/* Note that this code does the right thing for both row and
column panels, since an m x n column-stored row panel and an
n x m row-stored column panel look the same in memory. */ \
dim_t i = panel_dim_i; \
dim_t j = panel_len_i; \
dim_t m_br = panel_dim - i; \
dim_t n_br = panel_len_max_i - j; \
inc_t rs_pe = 1; \
inc_t cs_pe = ldp; \
ctype* one = PASTEMAC(ch,1); \
ctype* p_edge = p_begin + (i )*rs_pe + (j )*cs_pe; \
\
PASTEMAC2(ch,ch,setd_unb_var1)( 0, \
m_br, \
n_br, \
one, \
p_edge, rs_pe, cs_pe ); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "packm_var3: setting br unit diag", m_br, n_br, \
p_edge, rs_pe, cs_pe, "%5.2f", "" ); \
*/ \
} \
\
/*
if ( rs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var3: ap copied", panel_dim, panel_len_max_i, \
PASTEMAC(ch,fprintm)( stdout, "packm_var3: ap copied", panel_dim_max, panel_len_max_i, \
p_begin, rs_p, cs_p, "%4.1f", "" ); \
if ( cs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var3: bp copied", panel_len_max_i, panel_dim, \
PASTEMAC(ch,fprintm)( stdout, "packm_var3: bp copied", panel_len_max_i, panel_dim_max, \
p_begin, rs_p, cs_p, "%4.1f", "" ); \
*/ \
\
\
p_begin += p_inc; \
} \

View File

@@ -52,7 +52,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* beta, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t ldp \
void* p, inc_t ldp \
);
INSERT_GENTPROT_BASIC( packm_cxk )

View File

@@ -0,0 +1,154 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* packm_gen_cxk (function template).

   Packs a single micro-panel of C into P with one call to packm_cxk(),
   then zero-fills whatever padding exists between the panel's actual
   dimensions (m_panel x n_panel) and its padded dimensions
   (m_panel_max x n_panel_max).

   strucc, diagoffc and uploc are consulted only to detect a panel that
   lies in the unstored region of C; in that case the source pointer is
   shifted by diagoffc*cs_c - diagoffc*rs_c, the two source strides are
   exchanged, and (for Hermitian strucc) the conjugation is toggled.
   kappa is forwarded untouched to packm_cxk(). */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t strucc, \
                           doff_t  diagoffc, \
                           uplo_t  uploc, \
                           conj_t  conjc, \
                           dim_t   m_panel, \
                           dim_t   n_panel, \
                           dim_t   m_panel_max, \
                           dim_t   n_panel_max, \
                           void*   kappa, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   p, inc_t rs_p, inc_t cs_p \
                         ) \
{ \
    ctype* restrict src         = c; \
    ctype* restrict dst         = p; \
    ctype* restrict kappa_typed = kappa; \
    ctype* restrict zero_elem   = PASTEMAC(ch,0); \
\
    /* Begin with the parameters for a column-stored row panel; they are
       replaced below when the strides of p indicate row storage. */ \
    dim_t  short_dim = m_panel; \
    dim_t  long_dim  = n_panel; \
    inc_t  src_inc   = rs_c; \
    inc_t  src_ld    = cs_c; \
    inc_t  dst_ld    = cs_p; \
    conj_t conj_use  = conjc; \
\
    if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
    { \
        /* Row-stored column panel: the roles of m and n, and of the row
           and column strides, are exchanged. */ \
        short_dim = n_panel; \
        long_dim  = m_panel; \
        src_inc   = cs_c; \
        src_ld    = rs_c; \
        dst_ld    = rs_p; \
    } \
\
    /* A panel in the unstored region has to be read from where its data
       actually lives: move the source pointer, exchange the source
       strides and, for Hermitian structure, flip the conjugation. (A
       general matrix has no unstored region, so this only triggers for
       dense panels of a symmetric or Hermitian matrix.) */ \
    if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
    { \
        src += diagoffc * ( doff_t )cs_c; \
        src -= diagoffc * ( doff_t )rs_c; \
        bli_swap_incs( src_inc, src_ld ); \
\
        if ( bli_is_hermitian( strucc ) ) \
        { \
            bli_toggle_conj( conj_use ); \
        } \
    } \
\
    /* Copy the panel in one shot. */ \
    PASTEMAC(ch,packm_cxk)( conj_use, \
                            short_dim, \
                            long_dim, \
                            kappa_typed, \
                            src, src_inc, src_ld, \
                            dst, dst_ld ); \
\
    /* P was sized with padded ("max") dimensions. Any rows below m_panel
       and any columns to the right of n_panel are set to zero so that the
       micro-kernel never needs edge-case register blockings. */ \
    if ( m_panel_max != m_panel ) \
    { \
        ctype* bottom_pad = dst + ( m_panel )*rs_p; \
\
        PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
                                        BLIS_NONUNIT_DIAG, \
                                        BLIS_DENSE, \
                                        m_panel_max - m_panel, \
                                        n_panel_max, \
                                        zero_elem, \
                                        bottom_pad, rs_p, cs_p ); \
    } \
\
    if ( n_panel_max != n_panel ) \
    { \
        ctype* right_pad = dst + ( n_panel )*cs_p; \
\
        PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
                                        BLIS_NONUNIT_DIAG, \
                                        BLIS_DENSE, \
                                        m_panel_max, \
                                        n_panel_max - n_panel, \
                                        zero_elem, \
                                        right_pad, rs_p, cs_p ); \
    } \
}
INSERT_GENTFUNC_BASIC0( packm_gen_cxk )

View File

@@ -0,0 +1,53 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Prototype template for packm_gen_cxk.
   Parameters, in order:
     strucc, diagoffc, uploc  - structure, diagonal offset and stored-triangle
                                descriptors of the source panel
     conjc                    - conjugation to apply to the source
     m_panel, n_panel         - dimensions of the panel
     m_panel_max, n_panel_max - dimensions of the panel including padding
     kappa                    - scalar, passed as an untyped pointer
     c, rs_c, cs_c            - source panel and its row/column strides
     p, rs_p, cs_p            - destination (packed) panel and its strides
   INSERT_GENTPROT_BASIC presumably expands this template once per
   supported datatype - confirm against its definition. */
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p \
);
INSERT_GENTPROT_BASIC( packm_gen_cxk )

View File

@@ -0,0 +1,274 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p \
) \
{ \
ctype* restrict c_begin = c; \
ctype* restrict p_begin = p; \
ctype* restrict kappa_cast = kappa; \
ctype* restrict zero = PASTEMAC(ch,0); \
\
dim_t i, j; \
dim_t panel_len; \
doff_t diagoffc_abs; \
dim_t panel_dim; \
inc_t incc, ldc; \
inc_t ldp; \
\
ctype* restrict c10; \
ctype* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
ctype* restrict c12; \
ctype* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
ctype* restrict c11; \
ctype* restrict p11; \
dim_t p11_m; \
dim_t p11_n; \
inc_t rs_p11, cs_p11; \
\
\
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_len = m_panel; \
panel_dim = n_panel; \
incc = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_len = n_panel; \
panel_dim = m_panel; \
incc = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
rs_p11 = 1; \
cs_p11 = cs_p; \
} \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p_begin; \
c10 = c_begin; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
diagoffc12 = diagoffc_abs - j; \
p12 = p_begin + (j )*ldp; \
c12 = c_begin + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
p11_m = panel_dim; \
p11_n = panel_dim; \
j = diagoffc_abs; \
p11 = p_begin + (j )*ldp; \
c11 = c_begin + (j )*ldc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p_begin; \
c10 = c_begin; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p_begin + (j )*ldp; \
c12 = c_begin + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
p11_m = panel_dim; \
p11_n = panel_dim; \
j = diagoffc_abs; \
p11 = p_begin + (j )*ldp; \
c11 = c_begin + (j )*ldc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc10 ); \
} \
\
/* Pack to P10. For upper storage, this includes the unstored
triangle of C11. */ \
PASTEMAC(ch,packm_cxk)( conjc10, \
p10_dim, \
p10_len, \
kappa_cast, \
c10, incc10, ldc10, \
p10, ldp ); \
\
/* Pack to P12. For lower storage, this includes the unstored
triangle of C11. */ \
PASTEMAC(ch,packm_cxk)( conjc12, \
p12_dim, \
p12_len, \
kappa_cast, \
c12, incc12, ldc12, \
p12, ldp ); \
\
/* Pack the stored triangule of C11 to P11. */ \
PASTEMAC3(ch,ch,ch,scal2m_unb_var1)( 0, \
BLIS_NONUNIT_DIAG, \
uploc, \
conjc, \
p11_m, \
p11_n, \
kappa_cast, \
c11, rs_c, cs_c, \
p11, rs_p11, cs_p11 ); \
\
/* If source matrix C is Hermitian, we have to zero out the
imaginary components of the diagonal of P11 in case the
corresponding elements in C11 were not already zero. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
/* NOTE: We can directly increment p11 since we are done
using p11 for the remainder of the function. */ \
for ( i = 0; i < p11_m; ++i ) \
{ \
PASTEMAC(ch,seti0s)( *p11 ); \
\
p11 += rs_p11 + cs_p11; \
} \
} \
\
/* The packed memory region was acquired/allocated with "aligned"
dimensions (ie: dimensions that were possibly inflated up to a
multiple). When these dimensions are inflated, it creates empty
regions along the bottom and/or right edges of the matrix. If
either region exists, we set them to zero. This allows the
micro-kernel to remain simple since it does not need to support
different register blockings for the edge cases. */ \
if ( m_panel != m_panel_max ) \
{ \
dim_t i = m_panel; \
dim_t m_edge = m_panel_max - i; \
dim_t n_edge = n_panel_max; \
ctype* p_edge = p_begin + (i )*rs_p; \
\
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
} \
\
if ( n_panel != n_panel_max ) \
{ \
dim_t j = n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - j; \
ctype* p_edge = p_begin + (j )*cs_p; \
\
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
} \
\
}
INSERT_GENTFUNC_BASIC0( packm_herm_cxk )

View File

@@ -0,0 +1,53 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
   Prototype template for packm_herm_cxk, the panel-packing routine used
   for Hermitian/symmetric source matrices. GENTPROT is (re)defined here
   and then expanded by INSERT_GENTPROT_BASIC, which presumably emits one
   prototype per supported datatype -- confirm against the macro's
   definition.

   Parameters, as used by the visible part of the definition:
     strucc        structure of C; tested with bli_is_hermitian() to decide
                   whether a conjugation is toggled and whether the
                   imaginary parts of the packed diagonal are zeroed.
     diagoffc      diagonal offset associated with C.
     uploc         which triangle of C is stored.
     conjc         conjugation to apply while packing.
     m_panel,      dimensions of the panel being packed.
     n_panel
     m_panel_max,  padded dimensions of the packed panel; rows/columns of P
     n_panel_max   beyond m_panel/n_panel are set to zero.
     kappa         scalar handed to the packing kernels.
     c, rs_c, cs_c source matrix and its row/column strides.
     p, rs_p, cs_p packed destination and its row/column strides.

   NOTE(review): kappa, c and p are declared void* here, whereas the
   packm_tri_cxk prototype declares them ctype* -- confirm the difference
   is intentional.
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p \
);
INSERT_GENTPROT_BASIC( packm_herm_cxk )

View File

@@ -0,0 +1,210 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   packm_tri_cxk

   Packs one panel of a triangular matrix C into the packed buffer P and
   then post-processes the packed panel, in this order:

     1. the panel is packed by packm_cxk, which is given conjc and kappa;
     2. if diagc denotes an implicit unit diagonal, the diagonal of P (at
        offset diagoffp) is overwritten with kappa;
     3. if invdiag is TRUE, the diagonal of P is inverted;
     4. the part of P on the unstored side of the diagonal is set to zero;
     5. the padding rows (m_panel up to m_panel_max) and padding columns
        (n_panel up to n_panel_max) of P are set to zero and, when both
        kinds of padding are present, the diagonal of the bottom-right
        padded corner is set to one.

   strucc is accepted but not referenced by this implementation.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t strucc, \
                           doff_t diagoffp, \
                           diag_t diagc, \
                           uplo_t uploc, \
                           conj_t conjc, \
                           bool_t invdiag, \
                           dim_t m_panel, \
                           dim_t n_panel, \
                           dim_t m_panel_max, \
                           dim_t n_panel_max, \
                           ctype* kappa, \
                           ctype* c, inc_t rs_c, inc_t cs_c, \
                           ctype* p, inc_t rs_p, inc_t cs_p \
                         ) \
{ \
    ctype* restrict c_src     = c; \
    ctype* restrict p_dst     = p; \
    ctype* restrict kappa_ptr = kappa; \
    ctype* restrict zero_ptr  = PASTEMAC(ch,0); \
\
    /* Number of padding rows and columns appended to the panel in P. */ \
    dim_t           m_pad     = m_panel_max - m_panel; \
    dim_t           n_pad     = n_panel_max - n_panel; \
\
    /* Start from the settings for a column-stored row panel ... */ \
    dim_t           pnl_dim   = m_panel; \
    dim_t           pnl_len   = n_panel; \
    inc_t           inc_src   = rs_c; \
    inc_t           ld_src    = cs_c; \
    inc_t           ld_dst    = cs_p; \
\
    /* ... and switch to a row-stored column panel when the strides of P \
       indicate row storage. */ \
    if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
    { \
        pnl_dim = n_panel; \
        pnl_len = m_panel; \
        inc_src = cs_c; \
        ld_src  = rs_c; \
        ld_dst  = rs_p; \
    } \
\
    /* Step 1: pack the panel itself. */ \
    PASTEMAC(ch,packm_cxk)( conjc, \
                            pnl_dim, \
                            pnl_len, \
                            kappa_ptr, \
                            c_src, inc_src, ld_src, \
                            p_dst, ld_dst ); \
\
    /* Step 2: an implicit unit diagonal in C is made explicit in P by \
       writing kappa along the diagonal of the packed panel. */ \
    if ( bli_is_unit_diag( diagc ) ) \
    { \
        PASTEMAC2(ch,ch,setd_unb_var1)( diagoffp, \
                                        m_panel, \
                                        n_panel, \
                                        kappa_ptr, \
                                        p_dst, rs_p, cs_p ); \
    } \
\
    /* Step 3: invert the packed diagonal when the caller asks for it. */ \
    if ( invdiag == TRUE ) \
    { \
        PASTEMAC(ch,invertd_unb_var1)( diagoffp, \
                                       m_panel, \
                                       n_panel, \
                                       p_dst, rs_p, cs_p ); \
    } \
\
    /* Step 4: zero the side of P opposite the stored triangle. The uplo \
       value is toggled to address that side, and the diagonal offset is \
       shifted by one diagonal so that the diagonal itself is left out of \
       the zeroed region. Both adjustments are made on local copies. */ \
    { \
        uplo_t uplo_clear    = uploc; \
        doff_t diagoff_clear = diagoffp; \
\
        bli_toggle_uplo( uplo_clear ); \
        bli_shift_diag_offset_to_shrink_uplo( uplo_clear, diagoff_clear ); \
\
        PASTEMAC2(ch,ch,setm_unb_var1)( diagoff_clear, \
                                        BLIS_NONUNIT_DIAG, \
                                        uplo_clear, \
                                        m_panel, \
                                        n_panel, \
                                        zero_ptr, \
                                        p_dst, rs_p, cs_p ); \
    } \
\
    /* Step 5: P was allocated with dimensions that may have been rounded \
       up to a multiple, which leaves unused rows at the bottom and/or \
       unused columns at the right. Those regions are zeroed so that the \
       micro-kernel does not need special handling for edge cases. */ \
    if ( m_panel != m_panel_max ) \
    { \
        /* Bottom padding: m_pad rows across the full padded width. */ \
        PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
                                        BLIS_NONUNIT_DIAG, \
                                        BLIS_DENSE, \
                                        m_pad, \
                                        n_panel_max, \
                                        zero_ptr, \
                                        p_dst + (m_panel)*rs_p, rs_p, cs_p ); \
    } \
\
    if ( n_panel != n_panel_max ) \
    { \
        /* Right padding: n_pad columns across the full padded height. */ \
        PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
                                        BLIS_NONUNIT_DIAG, \
                                        BLIS_DENSE, \
                                        m_panel_max, \
                                        n_pad, \
                                        zero_ptr, \
                                        p_dst + (n_panel)*cs_p, rs_p, cs_p ); \
\
        /* With padding in both dimensions this is a bottom-right corner \
           case: the part of the diagonal that runs into the zero-padded \
           corner is set to identity. */ \
        if ( m_panel != m_panel_max ) \
        { \
            ctype* one_ptr = PASTEMAC(ch,1); \
\
            PASTEMAC2(ch,ch,setd_unb_var1)( 0, \
                                            m_pad, \
                                            n_pad, \
                                            one_ptr, \
                                            p_dst + (m_panel)*rs_p + (n_panel)*cs_p, rs_p, cs_p ); \
        } \
    } \
}
INSERT_GENTFUNC_BASIC0( packm_tri_cxk )

View File

@@ -0,0 +1,55 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
   Prototype template for packm_tri_cxk, the panel-packing routine used
   for triangular source matrices. GENTPROT is (re)defined here and then
   expanded by INSERT_GENTPROT_BASIC, which presumably emits one prototype
   per supported datatype -- confirm against the macro's definition.

   Parameters, as used by the definition:
     strucc        structure of C (not referenced by the definition).
     diagoffp      diagonal offset of the packed panel P; named to match
                   the definition, which applies it only to operations
                   on P.
     diagc         whether C has an implicit unit diagonal; if so, the
                   diagonal of P is overwritten with kappa.
     uploc         which triangle of C is stored; the opposite side of P
                   is set to zero.
     conjc         conjugation passed to the packing kernel.
     invdiag       if TRUE, the diagonal of the packed panel is inverted.
     m_panel,      dimensions of the panel being packed.
     n_panel
     m_panel_max,  padded dimensions of the packed panel; the padding is
     n_panel_max   zeroed, with ones on the diagonal of the bottom-right
                   padded corner.
     kappa         scalar handed to the packing kernel.
     c, rs_c, cs_c source matrix and its row/column strides.
     p, rs_p, cs_p packed destination and its row/column strides.
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* kappa, \
ctype* c, inc_t rs_c, inc_t cs_c, \
ctype* p, inc_t rs_p, inc_t cs_p \
);
INSERT_GENTPROT_BASIC( packm_tri_cxk )