Reorganized packm implementation.

Details:
- Reorganized packm variants and structure-aware kernels so that all
  routines for a given pack format (4m, 3m, regular) reside in a single
  file.
- Renamed _blk_var4 to _blk_var2 and generalized it so that it works for
  both 4m and 3m, and adjusted the 4m/3m _cntl_init() functions accordingly.
- Added a new packm_ker_t function pointer type to bli_kernel_type_defs.h
  to facilitate function pointer typecasting in the datatype-specific
  packm_blk_var2() functions.
- Deprecated _blk_var3.
- Fixed a bug in the triangular micro-panel packing facility that
  affected trmm and trmm3 with unit diagonals.
This commit is contained in:
Field G. Van Zee
2014-08-30 16:21:20 -05:00
parent c6793cecb7
commit f032ba9b11
35 changed files with 2251 additions and 2881 deletions

View File

@@ -42,12 +42,11 @@
#include "bli_packm_unb_var1.h"
#include "bli_packm_blk_var1.h"
#include "bli_packm_blk_var3.h"
#include "bli_packm_blk_var4.h"
#include "bli_packm_blk_var2.h"
#include "bli_packm_gen_cxk.h"
#include "bli_packm_herm_cxk.h"
#include "bli_packm_tri_cxk.h"
#include "bli_packm_struc_cxk.h"
#include "bli_packm_struc_cxk_4m.h"
#include "bli_packm_struc_cxk_3m.h"
#include "bli_packm_cxk.h"
#include "bli_packm_cxk_4m.h"

View File

@@ -54,11 +54,16 @@ typedef void (*FUNCPTR_T)(
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
packm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
extern func_t* packm_struc_cxk_kers;
extern func_t* packm_struc_cxk_4m_kers;
extern func_t* packm_struc_cxk_3m_kers;
void bli_packm_blk_var1( obj_t* c,
obj_t* p,
@@ -93,6 +98,9 @@ void bli_packm_blk_var1( obj_t* c,
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
// This variant assumes that the micro-kernel will always apply the
@@ -101,6 +109,13 @@ void bli_packm_blk_var1( obj_t* c,
// scale during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
// Choose the correct func_t object.
packm_kers = packm_struc_cxk_kers;
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_obj_query( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
@@ -123,12 +138,13 @@ void bli_packm_blk_var1( obj_t* c,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
pd_p, ps_p,
packm_ker,
t );
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
#define GENTFUNC( ctype, ch, varname, kertype ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
@@ -148,9 +164,12 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
packm_thrinfo_t* thread \
) \
{ \
PASTECH(ch,kertype) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
@@ -301,7 +320,7 @@ void PASTEMAC(ch,varname)( \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_tri_cxk() because we need to know the value of
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
@@ -334,22 +353,24 @@ void PASTEMAC(ch,varname)( \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk)( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
}\
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
} \
\
/* NOTE: This value is usually LESS than ps_p. */ \
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ldp * panel_len_max_i; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
@@ -363,17 +384,19 @@ void PASTEMAC(ch,varname)( \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk)( strucc, \
diagoffc_i, \
uploc, \
conjc, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
@@ -390,17 +413,19 @@ void PASTEMAC(ch,varname)( \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
conjc, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
/*
if ( row_stored ) \
@@ -420,5 +445,5 @@ void PASTEMAC(ch,varname)( \
} \
}
INSERT_GENTFUNC_BASIC0( packm_blk_var1 )
INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t )

View File

@@ -58,6 +58,7 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
packm_thrinfo_t* thread \
);

View File

@@ -54,13 +54,18 @@ typedef void (*FUNCPTR_T)(
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
packm_thrinfo_t* thread
);
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4);
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2);
extern func_t* packm_struc_cxk_kers;
extern func_t* packm_struc_cxk_4m_kers;
extern func_t* packm_struc_cxk_3m_kers;
void bli_packm_blk_var4( obj_t* c,
void bli_packm_blk_var2( obj_t* c,
obj_t* p,
packm_thrinfo_t* t )
{
@@ -95,9 +100,13 @@ void bli_packm_blk_var4( obj_t* c,
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
/*
// We want this variant to behave identically to that of variant 1
// in the real domain.
if ( bli_is_real( dt_cp ) )
@@ -105,6 +114,7 @@ void bli_packm_blk_var4( obj_t* c,
bli_packm_blk_var1( c, p, t );
return;
}
*/
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
@@ -140,11 +150,20 @@ void bli_packm_blk_var4( obj_t* c,
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
// Choose the correct func_t object based on the pack_t schema.
if ( bli_is_4m_packed( schema ) ) packm_kers = packm_struc_cxk_4m_kers;
else if ( bli_is_3m_packed( schema ) ) packm_kers = packm_struc_cxk_3m_kers;
else packm_kers = packm_struc_cxk_kers;
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_obj_query( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
//f = ftypes[dt_cp];
if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var4;
else f = bli_zpackm_blk_var4;
if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var2;
else f = bli_zpackm_blk_var2;
// Invoke the function.
f( strucc,
@@ -164,12 +183,13 @@ void bli_packm_blk_var4( obj_t* c,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
pd_p, ps_p,
packm_ker,
t );
}
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kertype ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
@@ -189,9 +209,12 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
packm_thrinfo_t* thread \
) \
{ \
PASTECH(ch,kertype) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
@@ -224,6 +247,7 @@ void PASTEMAC(ch,varname)( \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
dim_t ss_p; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
@@ -294,6 +318,15 @@ void PASTEMAC(ch,varname)( \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the "storage stride" of p. This is usually equal to ldp,
because usually ps_p = ldp * panel_len_max (e.g. where ldp is
equal to rs_p = packnr, or cs_p = packmr). But for 3m, the product
ldp * panel_len_max must be scaled by 3/2. packm_init() has already
scaled ps_p by 3/2, if needed, so rather than scale the product by
3/2 manually, we just compute the correct scaling factor and use it
instead of ldp. */ \
ss_p = ps_p / panel_len_max; \
\
/* Compute the total number of iterations we'll need. */ \
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
@@ -342,7 +375,7 @@ void PASTEMAC(ch,varname)( \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_tri_cxk() because we need to know the value of
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
@@ -375,41 +408,25 @@ void PASTEMAC(ch,varname)( \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk_4m)( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
} \
\
/* NOTE: This value is usually LESS than ps_p. */ \
p_inc = ldp * panel_len_max_i; \
\
/*
if ( rs_p == 1 ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
/*
if ( cs_p == 1 ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ss_p * panel_len_max_i; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
@@ -422,21 +439,23 @@ void PASTEMAC(ch,varname)( \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk_4m)( strucc, \
diagoffc_i, \
uploc, \
conjc, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
p_inc = ss_p * panel_len_max_i; \
} \
else \
{ \
@@ -449,37 +468,39 @@ void PASTEMAC(ch,varname)( \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk_4m)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
conjc, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_i", *m_panel_max, *n_panel_max, \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_r", *m_panel_max, *n_panel_max, \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_i", *m_panel_max, *n_panel_max, \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
p_inc = ss_p * panel_len_max_i; \
} \
\
\
@@ -488,5 +509,5 @@ void PASTEMAC(ch,varname)( \
} \
}
INSERT_GENTFUNCCO_BASIC0( packm_blk_var4 )
INSERT_GENTFUNCCO_BASIC( packm_blk_var2, packm_ker_t )

View File

@@ -32,7 +32,7 @@
*/
void bli_packm_blk_var3( obj_t* c,
void bli_packm_blk_var2( obj_t* c,
obj_t* p,
packm_thrinfo_t* t );
@@ -58,8 +58,9 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
packm_thrinfo_t* thread \
void* packm_ker, \
packm_thrinfo_t* t \
);
INSERT_GENTPROTCO_BASIC( packm_blk_var3 )
INSERT_GENTPROTCO_BASIC( packm_blk_var2 )

View File

@@ -1,477 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type used by bli_packm_blk_var3() to hold the
// datatype-specific implementation it dispatches to. The parameter list
// mirrors the arguments passed at the call site in bli_packm_blk_var3():
// structure/diagonal/uplo/trans properties of c, the pack schema and
// packing flags of p, the dimensions and padded dimensions of p, the
// kappa scalar buffer, the buffers and strides of c and p, the panel
// dimension and panel stride of p, and the thread info.
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p,
packm_thrinfo_t* thread
);
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3);
//
// bli_packm_blk_var3()
//
// Object-based front end for packm block variant 3.
//
//   c  - matrix object to be packed (source).
//   p  - pack object describing the destination buffer, its strides,
//        panel dimension/stride, pack schema, and packing flags.
//   t  - packm thread info used for chief election, broadcast, and
//        passed through to the datatype-specific implementation.
//
// Behavior:
//   - For real datatypes, the call is forwarded unchanged to
//     bli_packm_blk_var1() and nothing else is done here.
//   - For complex datatypes, a scaling factor (kappa) is chosen and the
//     work is dispatched to bli_cpackm_blk_var3() or
//     bli_zpackm_blk_var3().
//
void bli_packm_blk_var3( obj_t*           c,
                         obj_t*           p,
                         packm_thrinfo_t* t )
{
    num_t     dt_cp     = bli_obj_datatype( *c );

    struc_t   strucc    = bli_obj_struc( *c );
    doff_t    diagoffc  = bli_obj_diag_offset( *c );
    diag_t    diagc     = bli_obj_diag( *c );
    uplo_t    uploc     = bli_obj_uplo( *c );
    trans_t   transc    = bli_obj_conjtrans_status( *c );
    pack_t    schema    = bli_obj_pack_status( *p );
    bool_t    invdiag   = bli_obj_has_inverted_diag( *p );
    bool_t    revifup   = bli_obj_is_pack_rev_if_upper( *p );
    bool_t    reviflo   = bli_obj_is_pack_rev_if_lower( *p );

    dim_t     m_p       = bli_obj_length( *p );
    dim_t     n_p       = bli_obj_width( *p );
    dim_t     m_max_p   = bli_obj_padded_length( *p );
    dim_t     n_max_p   = bli_obj_padded_width( *p );

    void*     buf_c     = bli_obj_buffer_at_off( *c );
    inc_t     rs_c      = bli_obj_row_stride( *c );
    inc_t     cs_c      = bli_obj_col_stride( *c );

    void*     buf_p     = bli_obj_buffer_at_off( *p );
    inc_t     rs_p      = bli_obj_row_stride( *p );
    inc_t     cs_p      = bli_obj_col_stride( *p );
    dim_t     pd_p      = bli_obj_panel_dim( *p );
    inc_t     ps_p      = bli_obj_panel_stride( *p );

    obj_t     kappa;
    obj_t*    kappa_p;
    void*     buf_kappa;

    FUNCPTR_T f;

    // We want this variant to behave identically to that of variant 1
    // in the real domain.
    if ( bli_is_real( dt_cp ) )
    {
        bli_packm_blk_var1( c, p, t );
        return;
    }

    // The value for kappa we use will depend on whether the scalar
    // attached to A has a nonzero imaginary component. If it does,
    // then we will apply the scalar during packing to facilitate
    // implementing complex domain micro-kernels in terms of their
    // real domain counterparts. (In the aforementioned situation,
    // applying a real scalar is easy, but applying a complex one is
    // harder, so we avoid the need altogether with the code below.)
    // Only the chief thread inspects and modifies the scalar attached
    // to p; its choice is then shared via thread_obroadcast().
    if ( thread_am_ochief( t ) )
    {
        if ( bli_obj_scalar_has_nonzero_imag( p ) )
        {
            // Detach the scalar.
            bli_obj_scalar_detach( p, &kappa );

            // Reset the attached scalar (to 1.0).
            bli_obj_scalar_reset( p );

            // Use the scalar that was just detached into the local
            // object. (The page rendering of this line had collapsed
            // "&kappa;" into a Greek letter, which is not valid C.)
            kappa_p = &kappa;
        }
        else
        {
            // If the internal scalar of A has only a real component, then
            // we will apply it later (in the micro-kernel), and so we will
            // use BLIS_ONE to indicate no scaling during packing.
            kappa_p = &BLIS_ONE;
        }
    }

    // NOTE(review): non-chief threads reach this point without having
    // assigned kappa_p; presumably thread_obroadcast() replaces it with
    // the chief's pointer -- confirm against its definition.
    kappa_p = thread_obroadcast( t, kappa_p );

    // Acquire the buffer to the kappa chosen above.
    buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );

    // Index into the type combination array to extract the correct
    // function pointer.
    //f = ftypes[dt_cp];
    // Real datatypes returned above, so anything that is not scomplex is
    // handled by the dcomplex implementation.
    if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var3;
    else                            f = bli_zpackm_blk_var3;

    // Invoke the function.
    f( strucc,
       diagoffc,
       diagc,
       uploc,
       transc,
       schema,
       invdiag,
       revifup,
       reviflo,
       m_p,
       n_p,
       m_max_p,
       n_max_p,
       buf_kappa,
       buf_c, rs_c, cs_c,
       buf_p, rs_p, cs_p,
       pd_p, ps_p,
       t );
}
// Template for the datatype-specific packm_blk_var3 implementations
// (instantiated by INSERT_GENTFUNCCO_BASIC0 below; presumably once per
// complex datatype -- confirm against the macro's definition).
//
// The generated function walks the matrix c one micro-panel at a time
// (pd_p rows or columns per panel, depending on the schema) and, for each
// panel, calls one of the 3m structure-aware packing kernels:
//   - packm_tri_cxk_3m  for diagonal-intersecting panels of a triangular
//                       matrix,
//   - packm_herm_cxk_3m for any panel of a Hermitian/symmetric matrix,
//   - packm_gen_cxk_3m  for all remaining (general or fully stored) panels.
// Completely unstored panels of a triangular matrix are skipped without
// advancing the destination pointer. A panel is only packed by the calling
// thread when packm_thread_my_iter( it, thread ) is true, but the
// destination pointer is advanced by every thread so that all threads agree
// on panel locations. After each packed (or thread-skipped) panel, p_begin
// advances by ( ldp * panel_len_max_i * 3 ) / 2 elements of ctype.
//
// Note that ps_p is accepted but not read by the code below; the panel
// increment is recomputed from ldp and panel_len_max_i instead.
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
packm_thrinfo_t* thread \
) \
{ \
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t num_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( rs_c, cs_c ); \
bli_negate_diag_offset( diagoffc ); \
bli_toggle_uplo( uploc ); \
bli_toggle_trans( transc ); \
} \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the total number of iterations we'll need. */ \
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (num_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = num_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_tri_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes
was somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk_3m)( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
} \
\
\
/* NOTE: This value is usually LESS than (ps_p*3)/2. */ \
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
\
/*
if ( cs_p == 1 ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk_3m)( strucc, \
diagoffc_i, \
uploc, \
conjc, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to (ps_p*3)/2. */ \
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk_3m)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
conjc, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to (ps_p*3)/2. */ \
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
\
} \
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
\
p_begin += p_inc; \
} \
}
INSERT_GENTFUNCCO_BASIC0( packm_blk_var3 )

View File

@@ -1,65 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based entry point for packm block variant 4. Takes two matrix
// objects (c and p) and the packm thread info t; presumably c is the
// source and p the packed destination, as in the other packm variants --
// confirm against the implementation.
void bli_packm_blk_var4( obj_t* c,
obj_t* p,
packm_thrinfo_t* t );
// Prototype template for the datatype-specific packm_blk_var4 functions.
// INSERT_GENTPROTCO_BASIC below expands it for packm_blk_var4 (presumably
// once per complex datatype -- confirm against the macro's definition).
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
packm_thrinfo_t* t \
);
INSERT_GENTPROTCO_BASIC( packm_blk_var4 )

View File

@@ -37,6 +37,10 @@
blksz_t* packm_mult_ldim;
blksz_t* packm_mult_nvec;
func_t* packm_struc_cxk_kers;
func_t* packm_struc_cxk_4m_kers;
func_t* packm_struc_cxk_3m_kers;
packm_t* packm_cntl_row;
packm_t* packm_cntl_col;
@@ -47,6 +51,30 @@ packm_t* packm_cntl;
void bli_packm_cntl_init()
{
// Create function pointer object for each datatype-specific packm
// kernel.
packm_struc_cxk_kers
=
bli_func_obj_create( bli_spackm_struc_cxk, FALSE,
bli_dpackm_struc_cxk, FALSE,
bli_cpackm_struc_cxk, FALSE,
bli_zpackm_struc_cxk, FALSE );
packm_struc_cxk_4m_kers
=
bli_func_obj_create( NULL, FALSE,
NULL, FALSE,
bli_cpackm_struc_cxk_4m, FALSE,
bli_zpackm_struc_cxk_4m, FALSE );
packm_struc_cxk_3m_kers
=
bli_func_obj_create( NULL, FALSE,
NULL, FALSE,
bli_cpackm_struc_cxk_3m, FALSE,
bli_zpackm_struc_cxk_3m, FALSE );
// Create blocksize objects for m and n register blocking. We will attach
// these to the packm control node so they can be used to (a) allocate a
// block whose m and n dimension are multiples of mr and nr, and (b) know
@@ -119,6 +147,10 @@ void bli_packm_cntl_init()
void bli_packm_cntl_finalize()
{
bli_func_obj_free( packm_struc_cxk_kers );
bli_func_obj_free( packm_struc_cxk_4m_kers );
bli_func_obj_free( packm_struc_cxk_3m_kers );
bli_cntl_obj_free( packm_cntl_row );
bli_cntl_obj_free( packm_cntl_col );

View File

@@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)(
dim_t n,
void* kappa,
void* a, inc_t inca, inc_t lda,
void* p, inc_t psp, inc_t ldp
void* p, inc_t is_p, inc_t ldp
);
#undef FUNCPTR_ARRAY_LENGTH
@@ -158,7 +158,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
dim_t panel_dim; \
@@ -187,7 +187,7 @@ void PASTEMAC(ch,varname)( \
n, \
kappa, \
a, inca, lda, \
p, psp, ldp ); \
p, is_p, ldp ); \
} \
else \
{ \
@@ -196,8 +196,8 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict a_r = ( ctype_r* )a; \
ctype_r* restrict a_i = ( ctype_r* )a + 1; \
ctype_r* restrict p_r = ( ctype_r* )p; \
ctype_r* restrict p_i = ( ctype_r* )p + psp; \
ctype_r* restrict p_rpi = ( ctype_r* )p + 2*psp; \
ctype_r* restrict p_i = ( ctype_r* )p + is_p; \
ctype_r* restrict p_rpi = ( ctype_r* )p + 2*is_p; \
const dim_t inca2 = 2*inca; \
const dim_t lda2 = 2*lda; \
\

View File

@@ -44,7 +44,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
);
INSERT_GENTPROTCO_BASIC( packm_cxk_3m )

View File

@@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)(
dim_t n,
void* kappa,
void* a, inc_t inca, inc_t lda,
void* p, inc_t psp, inc_t ldp
void* p, inc_t is_p, inc_t ldp
);
#undef FUNCPTR_ARRAY_LENGTH
@@ -159,7 +159,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
dim_t panel_dim; \
@@ -188,7 +188,7 @@ void PASTEMAC(ch,varname)( \
n, \
kappa, \
a, inca, lda, \
p, psp, ldp ); \
p, is_p, ldp ); \
} \
else \
{ \
@@ -197,7 +197,7 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict a_r = ( ctype_r* )a; \
ctype_r* restrict a_i = ( ctype_r* )a + 1; \
ctype_r* restrict p_r = ( ctype_r* )p; \
ctype_r* restrict p_i = ( ctype_r* )p + psp; \
ctype_r* restrict p_i = ( ctype_r* )p + is_p; \
const dim_t inca2 = 2*inca; \
const dim_t lda2 = 2*lda; \
\

View File

@@ -44,7 +44,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
);
INSERT_GENTPROTCO_BASIC( packm_cxk_4m )

View File

@@ -1,401 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* packm_gen_cxk: pack a single micro-panel of a general (dense) matrix c
   into p via the packm_cxk kernel front-end, then zero-fill the portion of
   the m_panel_max x n_panel_max footprint that lies beyond the m_panel x
   n_panel region. The strucc, diagoffc, and uploc arguments are accepted
   but not referenced by this routine. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t         strucc, \
                           doff_t          diagoffc, \
                           uplo_t          uploc, \
                           conj_t          conjc, \
                           dim_t           m_panel, \
                           dim_t           n_panel, \
                           dim_t           m_panel_max, \
                           dim_t           n_panel_max, \
                           ctype* restrict kappa, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict p, inc_t rs_p, inc_t cs_p \
                         ) \
{ \
	/* Start from the column-stored interpretation of p (ie: we are
	   packing to a row panel) and override it below when the strides
	   of p indicate row storage (ie: a column panel). */ \
	dim_t panel_dim = m_panel; \
	dim_t panel_len = n_panel; \
	inc_t incc      = rs_c; \
	inc_t ldc       = cs_c; \
	inc_t ldp       = cs_p; \
\
	if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
	{ \
		/* Row-stored column panel. */ \
		panel_dim = n_panel; \
		panel_len = m_panel; \
		incc      = cs_c; \
		ldc       = rs_c; \
		ldp       = rs_p; \
	} \
\
	/* Hand the panel to the pack kernel front-end. */ \
	PASTEMAC(ch,packm_cxk)( conjc, \
	                        panel_dim, \
	                        panel_len, \
	                        kappa, \
	                        c, incc, ldc, \
	                        p,       ldp ); \
\
	/* The destination was allocated with dimensions that may have been
	   rounded up to a multiple of the register blocksize. Any rows or
	   columns introduced by that rounding are cleared here so that the
	   micro-kernel never needs special handling for edge cases. */ \
\
	/* Bottom edge: rows m_panel through m_panel_max-1. */ \
	if ( m_panel_max != m_panel ) \
	{ \
		ctype* restrict zero_val = PASTEMAC(ch,0); \
		dim_t           pad_m    = m_panel_max - m_panel; \
		dim_t           pad_n    = n_panel_max; \
		ctype*          p_bottom = p + m_panel*rs_p; \
\
		PASTEMAC(ch,setm)( 0, \
		                   BLIS_NONUNIT_DIAG, \
		                   BLIS_DENSE, \
		                   pad_m, \
		                   pad_n, \
		                   zero_val, \
		                   p_bottom, rs_p, cs_p ); \
	} \
\
	/* Right edge: columns n_panel through n_panel_max-1. */ \
	if ( n_panel_max != n_panel ) \
	{ \
		ctype* restrict zero_val = PASTEMAC(ch,0); \
		dim_t           pad_m    = m_panel_max; \
		dim_t           pad_n    = n_panel_max - n_panel; \
		ctype*          p_right  = p + n_panel*cs_p; \
\
		PASTEMAC(ch,setm)( 0, \
		                   BLIS_NONUNIT_DIAG, \
		                   BLIS_DENSE, \
		                   pad_m, \
		                   pad_n, \
		                   zero_val, \
		                   p_right, rs_p, cs_p ); \
	} \
}
INSERT_GENTFUNC_BASIC0( packm_gen_cxk )
/* packm_gen_cxk_4m: pack a single micro-panel of a general complex matrix c
   into p using the packm_cxk_4m kernel front-end. The packed panel consists
   of a real sub-panel followed, psp real elements later, by an imaginary
   sub-panel; both are zero-filled beyond the m_panel x n_panel region. The
   strucc, diagoffc, and uploc arguments are not referenced. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t         strucc, \
                           doff_t          diagoffc, \
                           uplo_t          uploc, \
                           conj_t          conjc, \
                           dim_t           m_panel, \
                           dim_t           n_panel, \
                           dim_t           m_panel_max, \
                           dim_t           n_panel_max, \
                           ctype* restrict kappa, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict p, inc_t rs_p, inc_t cs_p \
                         ) \
{ \
	ctype_r* restrict zero_r = PASTEMAC(chr,0); \
\
	/* Start from the column-stored interpretation of p (a row panel)
	   and override it below when p is row-stored (a column panel). */ \
	dim_t panel_dim     = m_panel; \
	dim_t panel_len     = n_panel; \
	dim_t panel_len_max = n_panel_max; \
	inc_t incc          = rs_c; \
	inc_t ldc           = cs_c; \
	inc_t ldp           = cs_p; \
	inc_t psp; \
	inc_t part; \
\
	if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
	{ \
		/* Row-stored column panel. */ \
		panel_dim     = n_panel; \
		panel_len     = m_panel; \
		panel_len_max = m_panel_max; \
		incc          = cs_c; \
		ldc           = rs_c; \
		ldp           = rs_p; \
	} \
\
	/* Element offset from the real sub-panel to the imaginary one. */ \
	psp = ldp * panel_len_max; \
\
	/* Hand the panel to the pack kernel front-end. */ \
	PASTEMAC(ch,packm_cxk_4m)( conjc, \
	                           panel_dim, \
	                           panel_len, \
	                           kappa, \
	                           c, incc, ldc, \
	                           p, psp, ldp ); \
\
	/* The destination was allocated with dimensions that may have been
	   rounded up to a multiple of the register blocksize. Clear any rows
	   or columns introduced by that rounding, in the real sub-panel first
	   and then in the imaginary one, so the micro-kernel needs no special
	   edge-case handling. */ \
\
	/* Bottom edge: rows m_panel through m_panel_max-1. */ \
	if ( m_panel_max != m_panel ) \
	{ \
		dim_t    pad_m    = m_panel_max - m_panel; \
		dim_t    pad_n    = n_panel_max; \
		ctype_r* p_bottom = ( ctype_r* )p + m_panel*rs_p; \
\
		for ( part = 0; part < 2; ++part ) \
		{ \
			PASTEMAC(chr,setm)( 0, \
			                    BLIS_NONUNIT_DIAG, \
			                    BLIS_DENSE, \
			                    pad_m, \
			                    pad_n, \
			                    zero_r, \
			                    p_bottom + part*psp, rs_p, cs_p ); \
		} \
	} \
\
	/* Right edge: columns n_panel through n_panel_max-1. */ \
	if ( n_panel_max != n_panel ) \
	{ \
		dim_t    pad_m   = m_panel_max; \
		dim_t    pad_n   = n_panel_max - n_panel; \
		ctype_r* p_right = ( ctype_r* )p + n_panel*cs_p; \
\
		for ( part = 0; part < 2; ++part ) \
		{ \
			PASTEMAC(chr,setm)( 0, \
			                    BLIS_NONUNIT_DIAG, \
			                    BLIS_DENSE, \
			                    pad_m, \
			                    pad_n, \
			                    zero_r, \
			                    p_right + part*psp, rs_p, cs_p ); \
		} \
	} \
}
INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_4m )
/* packm_gen_cxk_3m: pack a single micro-panel of a general complex matrix c
   into p using the packm_cxk_3m kernel front-end. The packed panel consists
   of three real-typed sub-panels (real, imaginary, and real+imaginary)
   spaced psp elements apart; each is zero-filled beyond the m_panel x
   n_panel region. The strucc, diagoffc, and uploc arguments are not
   referenced. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t         strucc, \
                           doff_t          diagoffc, \
                           uplo_t          uploc, \
                           conj_t          conjc, \
                           dim_t           m_panel, \
                           dim_t           n_panel, \
                           dim_t           m_panel_max, \
                           dim_t           n_panel_max, \
                           ctype* restrict kappa, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict p, inc_t rs_p, inc_t cs_p \
                         ) \
{ \
	ctype_r* restrict zero_r = PASTEMAC(chr,0); \
\
	/* Start from the column-stored interpretation of p (a row panel)
	   and override it below when p is row-stored (a column panel). */ \
	dim_t panel_dim     = m_panel; \
	dim_t panel_len     = n_panel; \
	dim_t panel_len_max = n_panel_max; \
	inc_t incc          = rs_c; \
	inc_t ldc           = cs_c; \
	inc_t ldp           = cs_p; \
	inc_t psp; \
	inc_t part; \
\
	if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
	{ \
		/* Row-stored column panel. */ \
		panel_dim     = n_panel; \
		panel_len     = m_panel; \
		panel_len_max = m_panel_max; \
		incc          = cs_c; \
		ldc           = rs_c; \
		ldp           = rs_p; \
	} \
\
	/* Element offset between consecutive sub-panels (real -> imaginary
	   -> real+imaginary). */ \
	psp = ldp * panel_len_max; \
\
	/* Hand the panel to the pack kernel front-end. */ \
	PASTEMAC(ch,packm_cxk_3m)( conjc, \
	                           panel_dim, \
	                           panel_len, \
	                           kappa, \
	                           c, incc, ldc, \
	                           p, psp, ldp ); \
\
	/* The destination was allocated with dimensions that may have been
	   rounded up to a multiple of the register blocksize. Clear any rows
	   or columns introduced by that rounding in each of the three
	   sub-panels (in the order real, imaginary, real+imaginary) so the
	   micro-kernel needs no special edge-case handling. */ \
\
	/* Bottom edge: rows m_panel through m_panel_max-1. */ \
	if ( m_panel_max != m_panel ) \
	{ \
		dim_t    pad_m    = m_panel_max - m_panel; \
		dim_t    pad_n    = n_panel_max; \
		ctype_r* p_bottom = ( ctype_r* )p + m_panel*rs_p; \
\
		for ( part = 0; part < 3; ++part ) \
		{ \
			PASTEMAC(chr,setm)( 0, \
			                    BLIS_NONUNIT_DIAG, \
			                    BLIS_DENSE, \
			                    pad_m, \
			                    pad_n, \
			                    zero_r, \
			                    p_bottom + part*psp, rs_p, cs_p ); \
		} \
	} \
\
	/* Right edge: columns n_panel through n_panel_max-1. */ \
	if ( n_panel_max != n_panel ) \
	{ \
		dim_t    pad_m   = m_panel_max; \
		dim_t    pad_n   = n_panel_max - n_panel; \
		ctype_r* p_right = ( ctype_r* )p + n_panel*cs_p; \
\
		for ( part = 0; part < 3; ++part ) \
		{ \
			PASTEMAC(chr,setm)( 0, \
			                    BLIS_NONUNIT_DIAG, \
			                    BLIS_DENSE, \
			                    pad_m, \
			                    pad_n, \
			                    zero_r, \
			                    p_right + part*psp, rs_p, cs_p ); \
		} \
	} \
}
INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_3m )

File diff suppressed because it is too large Load Diff

View File

@@ -44,9 +44,9 @@ static FUNCPTR_T vars[6][3] =
{
// unblocked optimized unblocked blocked
{ bli_packm_unb_var1, NULL, bli_packm_blk_var1 },
{ NULL, NULL, bli_packm_blk_var2 },
{ NULL, NULL, NULL, },
{ NULL, NULL, NULL, },
{ NULL, NULL, bli_packm_blk_var3 },
{ NULL, NULL, bli_packm_blk_var4 },
{ NULL, NULL, NULL, },
{ NULL, NULL, NULL, },
};

View File

@@ -0,0 +1,511 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* packm_struc_cxk: structure-aware packing of a single micro-panel of c
   into p. General panels go straight to the pack kernel front-end
   (kername); Hermitian/symmetric and triangular panels are delegated to
   packm_herm_cxk and packm_tri_cxk, respectively. Afterwards, the part of
   the m_panel_max x n_panel_max footprint outside the m_panel x n_panel
   region is zero-filled and, for triangular matrices, the diagonal of the
   bottom-right padded corner is set to one. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t         strucc, \
                           doff_t          diagoffc, \
                           diag_t          diagc, \
                           uplo_t          uploc, \
                           conj_t          conjc, \
                           bool_t          invdiag, \
                           dim_t           m_panel, \
                           dim_t           n_panel, \
                           dim_t           m_panel_max, \
                           dim_t           n_panel_max, \
                           ctype* restrict kappa, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict p, inc_t rs_p, inc_t cs_p \
                         ) \
{ \
	/* Start from the column-stored interpretation of p (ie: we are
	   packing to a row panel) and override it below when the strides
	   of p indicate row storage (ie: a column panel). */ \
	dim_t panel_dim = m_panel; \
	dim_t panel_len = n_panel; \
	inc_t incc      = rs_c; \
	inc_t ldc       = cs_c; \
	inc_t ldp       = cs_p; \
\
	if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
	{ \
		/* Row-stored column panel. */ \
		panel_dim = n_panel; \
		panel_len = m_panel; \
		incc      = cs_c; \
		ldc       = rs_c; \
		ldp       = rs_p; \
	} \
\
	/* Dispatch on the structure of the matrix being packed. */ \
	if ( bli_is_general( strucc ) ) \
	{ \
		/* General: the pack kernel front-end can be used as-is. */ \
		PASTEMAC(ch,kername)( conjc, \
		                      panel_dim, \
		                      panel_len, \
		                      kappa, \
		                      c, incc, ldc, \
		                      p,       ldp ); \
	} \
	else if ( bli_is_herm_or_symm( strucc ) ) \
	{ \
		/* Hermitian/symmetric: use the dedicated helper. */ \
		PASTEMAC(ch,packm_herm_cxk)( strucc, \
		                             diagoffc, \
		                             uploc, \
		                             conjc, \
		                             m_panel, \
		                             n_panel, \
		                             m_panel_max, \
		                             n_panel_max, \
		                             panel_dim, \
		                             panel_len, \
		                             kappa, \
		                             c, rs_c, cs_c, \
		                                incc, ldc, \
		                             p, rs_p, cs_p, \
		                                      ldp ); \
	} \
	else /* ( bli_is_triangular( strucc ) ) */ \
	{ \
		/* Triangular: use the dedicated helper. */ \
		PASTEMAC(ch,packm_tri_cxk)( strucc, \
		                            diagoffc, \
		                            diagc, \
		                            uploc, \
		                            conjc, \
		                            invdiag, \
		                            m_panel, \
		                            n_panel, \
		                            m_panel_max, \
		                            n_panel_max, \
		                            panel_dim, \
		                            panel_len, \
		                            kappa, \
		                            c, rs_c, cs_c, \
		                               incc, ldc, \
		                            p, rs_p, cs_p, \
		                                     ldp ); \
	} \
\
	/* The destination was allocated with dimensions that may have been
	   rounded up to a multiple of the register blocksize. Any rows or
	   columns introduced by that rounding are cleared here so that the
	   micro-kernel never needs special handling for edge cases. */ \
\
	/* Bottom edge: rows m_panel through m_panel_max-1. */ \
	if ( m_panel_max != m_panel ) \
	{ \
		ctype* restrict zero_val = PASTEMAC(ch,0); \
		dim_t           pad_m    = m_panel_max - m_panel; \
		dim_t           pad_n    = n_panel_max; \
		ctype*          p_bottom = p + m_panel*rs_p; \
\
		PASTEMAC(ch,setm)( 0, \
		                   BLIS_NONUNIT_DIAG, \
		                   BLIS_DENSE, \
		                   pad_m, \
		                   pad_n, \
		                   zero_val, \
		                   p_bottom, rs_p, cs_p ); \
	} \
\
	/* Right edge: columns n_panel through n_panel_max-1. */ \
	if ( n_panel_max != n_panel ) \
	{ \
		ctype* restrict zero_val = PASTEMAC(ch,0); \
		dim_t           pad_m    = m_panel_max; \
		dim_t           pad_n    = n_panel_max - n_panel; \
		ctype*          p_right  = p + n_panel*cs_p; \
\
		PASTEMAC(ch,setm)( 0, \
		                   BLIS_NONUNIT_DIAG, \
		                   BLIS_DENSE, \
		                   pad_m, \
		                   pad_n, \
		                   zero_val, \
		                   p_right, rs_p, cs_p ); \
	} \
\
	/* A triangular panel that is an edge case in both dimensions must be
	   a bottom-right corner. Set the part of the diagonal that extends
	   into the zero-padded region to identity.
	   NOTE: This is only strictly necessary when packing for trsm, where
	   it helps keep NaNs and Infs out of the computation. It is done for
	   trmm as well; there the 1.0's end up being multiplied by the 0.0's
	   in the zero-padded region of the other matrix, so it is harmless. */ \
	if ( bli_is_triangular( strucc ) && \
	     m_panel != m_panel_max && \
	     n_panel != n_panel_max ) \
	{ \
		ctype* restrict one_val  = PASTEMAC(ch,1); \
		dim_t           corner_m = m_panel_max - m_panel; \
		dim_t           corner_n = n_panel_max - n_panel; \
		ctype*          p_corner = p + m_panel*rs_p + n_panel*cs_p; \
\
		PASTEMAC(ch,setd)( 0, \
		                   corner_m, \
		                   corner_n, \
		                   one_val, \
		                   p_corner, rs_p, cs_p ); \
	} \
}
INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
/* packm_herm_cxk: pack a single micro-panel of a Hermitian or symmetric
   matrix c into p, where only the triangle indicated by uploc is read
   through the diagonal block.

   - If the micro-panel does not intersect the diagonal, it is packed with
     one call to the pack kernel (kername). When the panel lies in the
     unstored region, c is first redirected to the reflected location, the
     increments incc/ldc are swapped, and (for Hermitian c) conjc is
     toggled.
   - If the micro-panel intersects the diagonal, it is split along its
     length into two pieces, p10 and p12, each packed with its own call to
     the kernel; one piece is read directly and the other from the
     reflected location. The panel_dim x panel_dim diagonal block p11 is
     then written by scal2m using uploc, and for Hermitian c the imaginary
     parts of its diagonal are set to zero.

   m_panel_max and n_panel_max are accepted but not referenced here. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp \
) \
{ \
doff_t diagoffc_abs; \
dim_t i, j; \
bool_t row_stored; \
bool_t col_stored; \
\
\
/* Create flags to indicate row or column storage. Since we don't
have the schema in scope, we must use the dimensions and strides
of the micro-panel to determine whether it is row- or column-
stored. */ \
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
{ \
c = c + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
bli_swap_incs( incc, ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername)( conjc, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, ldp ); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
{ \
/* Description of the first piece of the panel (source pointer,
destination pointer, dimensions, source increments, and
conjugation). diagoffc10 is only assigned in the second branch
below. */ \
ctype* restrict c10; \
ctype* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
/* Description of the second piece of the panel. diagoffc12 is only
assigned in the first branch below. */ \
ctype* restrict c12; \
ctype* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc < 0 ) || \
( row_stored && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
/* p10 spans the first diagoffc_abs positions along the panel
length and is read from c directly, with the caller's
increments and conjugation. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
/* p12 spans the remainder of the panel and is read with incc
and ldc swapped. Since j equals diagoffc_abs here, diagoffc12
evaluates to zero and the pointer adjustment to c12 below
adds nothing. */ \
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
diagoffc12 = diagoffc_abs - j; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
/* p10 spans the first diagoffc_abs + panel_dim positions along
the panel length and is read from the reflected location
(c shifted by diagoffc10) with incc and ldc swapped. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
/* p12 spans the remainder of the panel and is read from c
directly, with the caller's increments and conjugation. */ \
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
PASTEMAC(ch,kername)( conjc10, \
p10_dim, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
p10, ldp ); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
PASTEMAC(ch,kername)( conjc12, \
p12_dim, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
p12, ldp ); \
\
/* Pack the stored triangle of c11 to p11. */ \
{ \
ctype* restrict c11; \
ctype* restrict p11; \
dim_t p11_m; \
dim_t p11_n; \
inc_t rs_p11; \
inc_t cs_p11; \
\
/* The diagonal block is panel_dim x panel_dim and begins
diagoffc_abs positions along the panel length. */ \
p11_m = panel_dim; \
p11_n = panel_dim; \
j = diagoffc_abs; \
p11 = p + (j )*ldp; \
c11 = c + (j )*ldc; \
\
/* Compute the row and column strides of p11. */ \
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
\
\
PASTEMAC(ch,scal2m)( 0, \
BLIS_NONUNIT_DIAG, \
uploc, \
conjc, \
p11_m, \
p11_n, \
kappa, \
c11, rs_c, cs_c, \
p11, rs_p11, cs_p11 ); \
\
/* If source matrix c is Hermitian, we have to zero out the
imaginary components of the diagonal of p11 in case the
corresponding elements in c11 were not already zero. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
/* NOTE: We can directly increment p11 since we are done
using p11 for the remainder of the function. */ \
for ( i = 0; i < p11_m; ++i ) \
{ \
PASTEMAC(ch,seti0s)( *p11 ); \
\
p11 += rs_p11 + cs_p11; \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )
/* packm_tri_cxk: pack a single micro-panel of a triangular matrix c into p.
   The whole panel is first packed with the pack kernel front-end (kername);
   the diagonal of the packed panel is then overwritten with kappa when c
   has an implicit unit diagonal, inverted when invdiag is TRUE, and
   finally the region on the unstored side of the diagonal is set to zero.
   The strucc, m_panel_max, n_panel_max, rs_c, and cs_c arguments are
   accepted but not referenced by this routine. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t         strucc, \
                           doff_t          diagoffp, \
                           diag_t          diagc, \
                           uplo_t          uploc, \
                           conj_t          conjc, \
                           bool_t          invdiag, \
                           dim_t           m_panel, \
                           dim_t           n_panel, \
                           dim_t           m_panel_max, \
                           dim_t           n_panel_max, \
                           dim_t           panel_dim, \
                           dim_t           panel_len, \
                           ctype* restrict kappa, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                                              inc_t incc, inc_t ldc, \
                           ctype* restrict p, inc_t rs_p, inc_t cs_p, \
                                              inc_t ldp \
                         ) \
{ \
	ctype* restrict zero_val       = PASTEMAC(ch,0); \
	uplo_t          uplo_unstored  = uploc; \
	doff_t          doff_unstored  = diagoffp; \
\
	/* Step 1: pack the entire panel, ignoring its structure. */ \
	PASTEMAC(ch,kername)( conjc, \
	                      panel_dim, \
	                      panel_len, \
	                      kappa, \
	                      c, incc, ldc, \
	                      p,       ldp ); \
\
	/* Step 2: an implicit unit diagonal in c becomes an explicit
	   diagonal of kappa in the packed panel. */ \
	if ( bli_is_unit_diag( diagc ) ) \
	{ \
		PASTEMAC(ch,setd)( diagoffp, \
		                   m_panel, \
		                   n_panel, \
		                   kappa, \
		                   p, rs_p, cs_p ); \
	} \
\
	/* Step 3: invert the packed diagonal if the caller asked for it. */ \
	if ( invdiag == TRUE ) \
	{ \
		PASTEMAC(ch,invertd)( diagoffp, \
		                      m_panel, \
		                      n_panel, \
		                      p, rs_p, cs_p ); \
	} \
\
	/* Step 4: zero the region of p opposite the stored triangle. That
	   region is addressed by toggling uploc and then shifting the
	   diagonal offset so that the toggled region shrinks by one
	   diagonal (leaving the diagonal itself untouched). This
	   zero-filling is not needed for trsm, whose micro-kernel never
	   references the unstored region, but it is needed for trmm, which
	   uses the gemm micro-kernel. */ \
	bli_toggle_uplo( uplo_unstored ); \
	bli_shift_diag_offset_to_shrink_uplo( uplo_unstored, doff_unstored ); \
\
	PASTEMAC(ch,setm)( doff_unstored, \
	                   BLIS_NONUNIT_DIAG, \
	                   uplo_unstored, \
	                   m_panel, \
	                   n_panel, \
	                   zero_val, \
	                   p, rs_p, cs_p ); \
}
INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )

View File

@@ -51,15 +51,41 @@ void PASTEMAC(ch,varname)( \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
);
INSERT_GENTPROT_BASIC( packm_tri_cxk )
INSERT_GENTPROT_BASIC( packm_struc_cxk )
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffp, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp \
);
INSERT_GENTPROT_BASIC( packm_herm_cxk )
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
@@ -68,12 +94,14 @@ void PASTEMAC(ch,varname)( \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp \
);
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_4m )
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_3m )
INSERT_GENTPROT_BASIC( packm_tri_cxk )

View File

@@ -0,0 +1,688 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* packm_struc_cxk_3m: structure-aware packing of a single complex
   micro-panel of c into p in the 3m format, in which the packed panel
   consists of three real-typed sub-panels (real, imaginary, and
   real+imaginary) spaced is_p elements apart. General panels go straight
   to the pack kernel front-end (kername); Hermitian/symmetric and
   triangular panels are delegated to packm_herm_cxk_3m and
   packm_tri_cxk_3m, respectively. Afterwards, each sub-panel is zero-filled
   outside the m_panel x n_panel region and, for triangular matrices, the
   diagonal of the bottom-right padded corner is set to one in the real
   sub-panel and to zero in the imaginary sub-panel. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t         strucc, \
                           doff_t          diagoffc, \
                           diag_t          diagc, \
                           uplo_t          uploc, \
                           conj_t          conjc, \
                           bool_t          invdiag, \
                           dim_t           m_panel, \
                           dim_t           n_panel, \
                           dim_t           m_panel_max, \
                           dim_t           n_panel_max, \
                           ctype* restrict kappa, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict p, inc_t rs_p, inc_t cs_p \
                         ) \
{ \
	/* Start from the column-stored interpretation of p (a row panel)
	   and override it below when p is row-stored (a column panel). */ \
	dim_t panel_dim     = m_panel; \
	dim_t panel_len     = n_panel; \
	dim_t panel_len_max = n_panel_max; \
	inc_t incc          = rs_c; \
	inc_t ldc           = cs_c; \
	inc_t ldp           = cs_p; \
	inc_t is_p; \
	inc_t part; \
\
	if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
	{ \
		/* Row-stored column panel. */ \
		panel_dim     = n_panel; \
		panel_len     = m_panel; \
		panel_len_max = m_panel_max; \
		incc          = cs_c; \
		ldc           = rs_c; \
		ldp           = rs_p; \
	} \
\
	/* Imaginary stride: the element offset between consecutive
	   sub-panels (real -> imaginary -> real+imaginary). */ \
	is_p = ldp * panel_len_max; \
\
	/* Dispatch on the structure of the matrix being packed. */ \
	if ( bli_is_general( strucc ) ) \
	{ \
		/* General: the pack kernel front-end can be used as-is. */ \
		PASTEMAC(ch,kername)( conjc, \
		                      panel_dim, \
		                      panel_len, \
		                      kappa, \
		                      c, incc, ldc, \
		                      p, is_p, ldp ); \
	} \
	else if ( bli_is_herm_or_symm( strucc ) ) \
	{ \
		/* Hermitian/symmetric: use the dedicated helper. */ \
		PASTEMAC(ch,packm_herm_cxk_3m)( strucc, \
		                                diagoffc, \
		                                uploc, \
		                                conjc, \
		                                m_panel, \
		                                n_panel, \
		                                m_panel_max, \
		                                n_panel_max, \
		                                panel_dim, \
		                                panel_len, \
		                                kappa, \
		                                c, rs_c, cs_c, \
		                                   incc, ldc, \
		                                p, rs_p, cs_p, \
		                                   is_p, ldp ); \
	} \
	else /* ( bli_is_triangular( strucc ) ) */ \
	{ \
		/* Triangular: use the dedicated helper. */ \
		PASTEMAC(ch,packm_tri_cxk_3m)( strucc, \
		                               diagoffc, \
		                               diagc, \
		                               uploc, \
		                               conjc, \
		                               invdiag, \
		                               m_panel, \
		                               n_panel, \
		                               m_panel_max, \
		                               n_panel_max, \
		                               panel_dim, \
		                               panel_len, \
		                               kappa, \
		                               c, rs_c, cs_c, \
		                                  incc, ldc, \
		                               p, rs_p, cs_p, \
		                                  is_p, ldp ); \
	} \
\
	/* The destination was allocated with dimensions that may have been
	   rounded up to a multiple of the register blocksize. Clear any rows
	   or columns introduced by that rounding in each of the three
	   sub-panels (in the order real, imaginary, real+imaginary) so the
	   micro-kernel needs no special edge-case handling. */ \
\
	/* Bottom edge: rows m_panel through m_panel_max-1. */ \
	if ( m_panel_max != m_panel ) \
	{ \
		ctype_r* restrict zero_r   = PASTEMAC(chr,0); \
		dim_t             pad_m    = m_panel_max - m_panel; \
		dim_t             pad_n    = n_panel_max; \
		ctype_r*          p_bottom = ( ctype_r* )p + m_panel*rs_p; \
\
		for ( part = 0; part < 3; ++part ) \
		{ \
			PASTEMAC(chr,setm)( 0, \
			                    BLIS_NONUNIT_DIAG, \
			                    BLIS_DENSE, \
			                    pad_m, \
			                    pad_n, \
			                    zero_r, \
			                    p_bottom + part*is_p, rs_p, cs_p ); \
		} \
	} \
\
	/* Right edge: columns n_panel through n_panel_max-1. */ \
	if ( n_panel_max != n_panel ) \
	{ \
		ctype_r* restrict zero_r  = PASTEMAC(chr,0); \
		dim_t             pad_m   = m_panel_max; \
		dim_t             pad_n   = n_panel_max - n_panel; \
		ctype_r*          p_right = ( ctype_r* )p + n_panel*cs_p; \
\
		for ( part = 0; part < 3; ++part ) \
		{ \
			PASTEMAC(chr,setm)( 0, \
			                    BLIS_NONUNIT_DIAG, \
			                    BLIS_DENSE, \
			                    pad_m, \
			                    pad_n, \
			                    zero_r, \
			                    p_right + part*is_p, rs_p, cs_p ); \
		} \
	} \
\
	/* A triangular panel that is an edge case in both dimensions must be
	   a bottom-right corner. Set the part of the diagonal that extends
	   into the zero-padded region to identity (one in the real sub-panel,
	   zero in the imaginary sub-panel).
	   NOTE: This is only strictly necessary when packing for trsm, where
	   it helps keep NaNs and Infs out of the computation. It is done for
	   trmm as well; there the 1.0's end up being multiplied by the 0.0's
	   in the zero-padded region of the other matrix, so it is harmless. */ \
	if ( bli_is_triangular( strucc ) && \
	     m_panel != m_panel_max && \
	     n_panel != n_panel_max ) \
	{ \
		ctype_r* restrict one_r    = PASTEMAC(chr,1); \
		ctype_r* restrict zero_r   = PASTEMAC(chr,0); \
		dim_t             corner_m = m_panel_max - m_panel; \
		dim_t             corner_n = n_panel_max - n_panel; \
		ctype_r*          corner_r = ( ctype_r* )p + m_panel*rs_p + n_panel*cs_p; \
		ctype_r*          corner_i = corner_r + is_p; \
\
		PASTEMAC(chr,setd)( 0, \
		                    corner_m, \
		                    corner_n, \
		                    one_r, \
		                    corner_r, rs_p, cs_p ); \
		PASTEMAC(chr,setd)( 0, \
		                    corner_m, \
		                    corner_n, \
		                    zero_r, \
		                    corner_i, rs_p, cs_p ); \
	} \
}
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_3m, packm_cxk_3m )
/* packm_herm_cxk_3m

   Template for the helper that packs one micro-panel of a Hermitian or
   symmetric matrix c into p. It is instantiated below through
   INSERT_GENTFUNCCO_BASIC with kername = packm_cxk_3m.

   The packed panel is addressed as an array of ctype_r with three
   sections: the real parts start at p, the imaginary parts at p + is_p,
   and the sums (real + imaginary) at p + 2*is_p.

   Parameters as used by the body:
     strucc            - tested with bli_is_hermitian() to decide whether
                         conjugation is toggled and whether the imaginary
                         diagonal of p11 is zeroed.
     diagoffc, uploc   - diagonal offset and stored triangle of c; passed to
                         bli_intersects_diag_n() / bli_is_unstored_subpart_n().
     conjc             - conjugation requested for c.
     m_panel, n_panel  - panel dimensions; also used (with rs_p, cs_p) to
                         detect row vs. column storage of p.
     m_panel_max,
     n_panel_max       - not referenced by this routine.
     panel_dim,
     panel_len         - short and long dimension of the micro-panel.
     kappa             - scalar applied while packing.
     c, rs_c, cs_c     - source matrix and its strides.
     incc, ldc         - strides of c handed to the pack kernel.
     p, rs_p, cs_p     - destination micro-panel and its strides.
     is_p, ldp         - offset between the sections of p, and the leading
                         dimension of p. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp \
) \
{ \
doff_t diagoffc_abs; \
dim_t i, j; \
bool_t row_stored; \
bool_t col_stored; \
\
\
/* Create flags to indicate row or column storage. Since we don't
have the schema in scope, we must use the dimensions and strides
of the micro-panel to determine whether it is row- or column-
stored. */ \
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
\
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
{ \
c = c + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
bli_swap_incs( incc, ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername)( conjc, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, is_p, ldp ); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
{ \
/* The panel is split along its length into p10 (everything before
the split point) and p12 (everything after it); the diagonal
block p11 is rewritten afterwards from the stored triangle. */ \
ctype_r* restrict p_r = ( ctype_r* )p; \
\
ctype_r* restrict one_r = PASTEMAC(chr,1); \
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
\
ctype* restrict c10; \
ctype_r* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
ctype* restrict c12; \
ctype_r* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc < 0 ) || \
( row_stored && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
/* p10 (length diagoffc_abs) is read from c as-is. p12 (the
remainder, which includes the diagonal block) is read with
incc and ldc swapped, and conjugated if c is Hermitian. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p_r; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
/* Since j == diagoffc_abs here, diagoffc12 evaluates to zero and
the pointer adjustment of c12 below adds nothing. */ \
diagoffc12 = diagoffc_abs - j; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
/* p10 (length diagoffc_abs + panel_dim, i.e. up to and including
the diagonal block) is read from c shifted by diagoffc10 with
incc and ldc swapped, and conjugated if c is Hermitian. p12
(the remainder) is read from c as-is. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p_r; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
PASTEMAC(ch,kername)( conjc10, \
p10_dim, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
p10, is_p, ldp ); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
PASTEMAC(ch,kername)( conjc12, \
p12_dim, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
p12, is_p, ldp ); \
\
/* Pack the stored triangle of c11 to p11. */ \
{ \
/* c11 is accessed through ctype_r pointers: c11_r points at the
first ctype_r of c11 and c11_i one ctype_r later, and the
strides of c are doubled accordingly (rs_c11, cs_c11). Note
that this scope declares its own j, which shadows the outer
one. */ \
inc_t cs_p11; \
inc_t rs_p11; \
dim_t p11_m = panel_dim; \
dim_t p11_n = panel_dim; \
inc_t rs_c11 = 2*rs_c; \
inc_t cs_c11 = 2*cs_c; \
dim_t j = diagoffc_abs; \
ctype* c11 = ( ctype* )c + (j )*ldc; \
ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \
ctype_r* c11_r = ( ctype_r* )c11; \
ctype_r* c11_i = ( ctype_r* )c11 + 1; \
ctype_r* p11_r = ( ctype_r* )p11; \
ctype_r* p11_i = ( ctype_r* )p11 + is_p; \
ctype_r* alpha_r = one_r; \
ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
\
/* Compute the row and column strides of p11. */ \
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
\
/* Copy the real part of the stored triangle of c11 to p11_r. */ \
PASTEMAC(chr,scal2m)( 0, \
BLIS_NONUNIT_DIAG, \
uploc, \
BLIS_NO_TRANSPOSE, \
p11_m, \
p11_n, \
alpha_r, \
c11_r, rs_c11, cs_c11, \
p11_r, rs_p11, cs_p11 ); \
\
/* Copy the imaginary part of the stored triangle of c11 to p11_i,
scaling by -1 if conjugation on c was requested. */ \
PASTEMAC(chr,scal2m)( 0, \
BLIS_NONUNIT_DIAG, \
uploc, \
BLIS_NO_TRANSPOSE, \
p11_m, \
p11_n, \
alpha_i, \
c11_i, rs_c11, cs_c11, \
p11_i, rs_p11, cs_p11 ); \
\
/* If source matrix c is Hermitian, we have to zero out the
imaginary components of the diagonal of p11 in case the
corresponding elements in c11 were not already zero. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
for ( i = 0; i < p11_m; ++i ) \
{ \
ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \
\
PASTEMAC(chr,set0s)( *pi11_i ); \
} \
} \
\
/* Apply kappa to the part of p11 that corresponds to the stored
part of c11 that was copied above. */ \
if ( bli_is_upper( uploc ) ) \
{ \
PASTEMAC(ch,scalris_mxn_u)( 0, \
p11_m, \
p11_n, \
&kappa_r, \
&kappa_i, \
p11_r, \
p11_i, rs_p11, cs_p11 ); \
} \
else \
{ \
PASTEMAC(ch,scalris_mxn_l)( 0, \
p11_m, \
p11_n, \
&kappa_r, \
&kappa_i, \
p11_r, \
p11_i, rs_p11, cs_p11 ); \
} \
\
/* Update the p11 section of the rpi (real + imaginary) panel,
which starts is_p elements past p11_i. Every element of the
p11_m x p11_n block is recomputed as the sum of the
corresponding elements of p11_r and p11_i. */ \
{ \
ctype_r* p11_rpi = p11_i + is_p; \
\
for ( j = 0; j < p11_n; ++j ) \
for ( i = 0; i < p11_m; ++i ) \
{ \
ctype_r* pi11_r = p11_r + (i )*rs_p11 + (j )*cs_p11; \
ctype_r* pi11_i = p11_i + (i )*rs_p11 + (j )*cs_p11; \
ctype_r* pi11_rpi = p11_rpi + (i )*rs_p11 + (j )*cs_p11; \
\
PASTEMAC(chr,add3s)( *pi11_r, \
*pi11_i, \
*pi11_rpi ); \
} \
} \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_3m, packm_cxk_3m )
/* packm_tri_cxk_3m

   Template for the helper that packs one micro-panel of a triangular
   matrix c into p. It is instantiated below through
   INSERT_GENTFUNCCO_BASIC with kername = packm_cxk_3m.

   The whole panel is first packed by the kernel. The panel_dim x panel_dim
   diagonal block p11, located bli_abs( diagoffp ) * ldp elements into the
   panel, is then adjusted in place:
     - if diagc is unit, its diagonal is overwritten with kappa (real and
       imaginary parts separately) and the diagonal of the rpi section is
       recomputed as their sum;
     - if invdiag is TRUE, the diagonal is inverted with invertris;
     - the region on the other side of the diagonal from uploc is set to
       zero in all three sections.

   p is addressed as an array of ctype_r: real parts at p, imaginary parts
   at p + is_p, and the sums (real + imaginary) at p + 2*is_p.

   strucc, m_panel_max, n_panel_max, rs_c and cs_c are not referenced by
   this routine. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp \
) \
{ \
bool_t row_stored; \
bool_t col_stored; \
\
\
/* Create flags to indicate row or column storage. Since we don't
have the schema in scope, we must use the dimensions and strides
of the micro-panel to determine whether it is row- or column-
stored. */ \
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
\
\
/* Pack the panel. */ \
PASTEMAC(ch,kername)( conjc, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, is_p, ldp ); \
\
\
/* Tweak the panel according to its triangular structure */ \
{ \
/* j is the distance (in units of ldp) from the start of the panel
to the diagonal block; p11_r, p11_i and p11_rpi address that
block within the three sections of p. */ \
dim_t j = bli_abs( diagoffp ); \
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \
ctype_r* p11_rpi = ( ctype_r* )p + 2*is_p + (j )*ldp; \
inc_t cs_p11; \
inc_t rs_p11; \
\
/* Compute the row and column strides of p11. */ \
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
\
\
/* If the diagonal of c is implicitly unit, explicitly set the
diagonal of the packed panel to kappa. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
dim_t i; \
\
PASTEMAC(chr,setd)( 0, \
m_panel, \
n_panel, \
&kappa_r, \
p11_r, rs_p11, cs_p11 ); \
PASTEMAC(chr,setd)( 0, \
m_panel, \
n_panel, \
&kappa_i, \
p11_i, rs_p11, cs_p11 ); \
\
/* Update the diagonal of the p11 section of the rpi panel.
It simply needs to contain the sum of diagonals of p11_r
and p11_i.
NOTE(review): the offsets below are computed with rs_p/cs_p,
whereas the setd calls above use rs_p11/cs_p11. The two agree
only when the unit stride of p is 1 -- confirm against the
definitions of bli_is_row_stored_f/bli_is_col_stored_f. */ \
for ( i = 0; i < panel_dim; ++i ) \
{ \
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
ctype_r* pi11_rpi = p11_rpi + (i )*rs_p + (i )*cs_p; \
\
PASTEMAC(chr,add3s)( *pi11_r, *pi11_i, *pi11_rpi ); \
} \
} \
\
/* If requested, invert the diagonal of the packed panel. Note
that we do not need to update the rpi panel since inverted
diagonals are only needed by trsm, which does not use the
p11 section of the rpi panel. */ \
if ( invdiag == TRUE ) \
{ \
dim_t i; \
\
/* As above, the diagonal is walked with rs_p/cs_p rather than
rs_p11/cs_p11. */ \
for ( i = 0; i < panel_dim; ++i ) \
{ \
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
\
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
} \
} \
\
/* Set the region opposite the diagonal of p to zero. To do this,
we need to reference the "unstored" region on the other side of
the diagonal. This amounts to toggling uploc and then shifting
the diagonal offset to shrink the newly referenced region (by
one diagonal). Note that this zero-filling is not needed for
trsm, since the unstored region is not referenced by the trsm
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel.*/ \
{ \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
uplo_t uplop11 = uploc; \
doff_t diagoffp11 = 0; \
\
bli_toggle_uplo( uplop11 ); \
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
\
/* The same zero-fill is applied to the real, imaginary and rpi
sections of p11. */ \
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_r, rs_p11, cs_p11 ); \
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_i, rs_p11, cs_p11 ); \
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_rpi, rs_p11, cs_p11 ); \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_3m, packm_cxk_3m )

View File

@@ -32,14 +32,16 @@
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
@@ -49,7 +51,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
);
INSERT_GENTPROT_BASIC( packm_herm_cxk )
INSERT_GENTPROTCO_BASIC( packm_struc_cxk_3m )
@@ -65,11 +67,41 @@ void PASTEMAC(ch,varname)( \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp \
);
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_4m )
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_3m )
/* Prototype template for the triangular 3m micro-panel packing helper,
   instantiated below for packm_tri_cxk_3m.
   NOTE(review): the second parameter is named diagoffc here, while the
   definition of packm_tri_cxk_3m names it diagoffp. Parameter names in
   prototypes need not match the definition, but confirm which name is
   intended. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp \
);
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_3m )

View File

@@ -0,0 +1,638 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* packm_struc_cxk_4m

   Template for the structure-aware front-end that packs one micro-panel of
   c into p. It is instantiated below through INSERT_GENTFUNCCO_BASIC with
   kername = packm_cxk_4m.

   Steps performed:
     1. Derive panel_dim, panel_len, panel_len_max, incc, ldc and ldp from
        whether p is row-stored, then compute is_p = ldp * panel_len_max,
        the offset from the real section of p to its imaginary section.
     2. Dispatch on strucc: general panels go straight to the pack kernel;
        Hermitian/symmetric panels go to packm_herm_cxk_4m; anything else
        is treated as triangular and goes to packm_tri_cxk_4m.
     3. Zero-fill the bottom and/or right edge regions of both sections
        when m_panel < m_panel_max or n_panel < n_panel_max.
     4. For triangular panels that are edge cases in both dimensions, set
        the diagonal of the bottom-right corner to one in the real section
        and to zero in the imaginary section.

   The order matters: step 4 writes into the region zeroed by step 3.

   Parameters:
     strucc            - structure of c, used for the dispatch above.
     diagoffc, diagc,
     uploc, invdiag    - forwarded to the structure-specific helpers.
     conjc             - conjugation requested for c.
     m_panel, n_panel  - dimensions of the data to pack.
     m_panel_max,
     n_panel_max       - dimensions of the (possibly padded) panel in p.
     kappa             - scalar applied while packing.
     c, rs_c, cs_c     - source matrix and its strides.
     p, rs_p, cs_p     - destination micro-panel and its strides. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
) \
{ \
dim_t panel_dim; \
dim_t panel_len; \
dim_t panel_len_max; \
inc_t incc, ldc; \
inc_t is_p, ldp; \
\
\
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
panel_len = m_panel; \
panel_len_max = m_panel_max; \
incc = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
panel_len = n_panel; \
panel_len_max = n_panel_max; \
incc = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
} \
\
/* Compute the imaginary stride (ie: the element offset to the imaginary
panel). */ \
is_p = ldp * panel_len_max; \
\
\
/* Handle micro-panel packing based on the structure of the matrix
being packed. */ \
if ( bli_is_general( strucc ) ) \
{ \
/* For micro-panels of general matrices, we can call the pack
kernel front-end directly. */ \
PASTEMAC(ch,kername)( conjc, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, is_p, ldp ); \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* Call a helper function for micro-panels of Hermitian/symmetric
matrices. */ \
PASTEMAC(ch,packm_herm_cxk_4m)( strucc, \
diagoffc, \
uploc, \
conjc, \
m_panel, \
n_panel, \
m_panel_max, \
n_panel_max, \
panel_dim, \
panel_len, \
kappa, \
c, rs_c, cs_c, \
incc, ldc, \
p, rs_p, cs_p, \
is_p, ldp ); \
} \
else /* ( bli_is_triangular( strucc ) ) */ \
{ \
/* Call a helper function for micro-panels of triangular
matrices. */ \
PASTEMAC(ch,packm_tri_cxk_4m)( strucc, \
diagoffc, \
diagc, \
uploc, \
conjc, \
invdiag, \
m_panel, \
n_panel, \
m_panel_max, \
n_panel_max, \
panel_dim, \
panel_len, \
kappa, \
c, rs_c, cs_c, \
incc, ldc, \
p, rs_p, cs_p, \
is_p, ldp ); \
} \
\
\
/* The packed memory region was acquired/allocated with "aligned"
dimensions (ie: dimensions that were possibly inflated up to a
multiple). When these dimension are inflated, it creates empty
regions along the bottom and/or right edges of the matrix. If
either region exists, we set them to zero. This allows the
micro-kernel to remain simple since it does not need to support
different register blockings for the edge cases. */ \
if ( m_panel != m_panel_max ) \
{ \
/* Bottom edge: rows m_panel .. m_panel_max-1, all n_panel_max
columns, in both the real and imaginary sections. */ \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
dim_t i = m_panel; \
dim_t m_edge = m_panel_max - i; \
dim_t n_edge = n_panel_max; \
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
ctype_r* p_edge_i = ( ctype_r* )p + is_p + (i )*rs_p; \
\
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_i, rs_p, cs_p ); \
} \
\
if ( n_panel != n_panel_max ) \
{ \
/* Right edge: columns n_panel .. n_panel_max-1, all m_panel_max
rows, in both the real and imaginary sections. */ \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
dim_t j = n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - j; \
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
ctype_r* p_edge_i = ( ctype_r* )p + is_p + (j )*cs_p; \
\
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_i, rs_p, cs_p ); \
} \
\
\
if ( bli_is_triangular( strucc ) ) \
{ \
/* If this panel is an edge case in both panel dimension and length,
then it must be a bottom-right corner case. Set the part of the
diagonal that extends into the zero-padded region to identity.
NOTE: This is actually only necessary when packing for trsm, as
it helps prevent NaNs and Infs from creeping into the computation.
However, we set the region to identity for trmm as well. Those
1.0's end up getting multiplied by the 0.0's in the zero-padded
region of the other matrix, so there is no harm in this. */ \
if ( m_panel != m_panel_max && \
n_panel != n_panel_max ) \
{ \
ctype_r* restrict one_r = PASTEMAC(chr,1); \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
dim_t i = m_panel; \
dim_t j = n_panel; \
dim_t m_br = m_panel_max - i; \
dim_t n_br = n_panel_max - j; \
ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
ctype_r* p_br_i = ( ctype_r* )p + is_p + (i )*rs_p + (j )*cs_p; \
\
/* Identity in the 4m layout: ones on the real diagonal, zeros on
the imaginary diagonal. */ \
PASTEMAC(chr,setd)( 0, \
m_br, \
n_br, \
one_r, \
p_br_r, rs_p, cs_p ); \
PASTEMAC(chr,setd)( 0, \
m_br, \
n_br, \
zero_r, \
p_br_i, rs_p, cs_p ); \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_4m, packm_cxk_4m )
/* packm_herm_cxk_4m

   Template for the helper that packs one micro-panel of a Hermitian or
   symmetric matrix c into p. It is instantiated below through
   INSERT_GENTFUNCCO_BASIC with kername = packm_cxk_4m.

   The packed panel is addressed as an array of ctype_r with two sections:
   the real parts start at p and the imaginary parts at p + is_p.

   Parameters as used by the body:
     strucc            - tested with bli_is_hermitian() to decide whether
                         conjugation is toggled and whether the imaginary
                         diagonal of p11 is zeroed.
     diagoffc, uploc   - diagonal offset and stored triangle of c; passed to
                         bli_intersects_diag_n() / bli_is_unstored_subpart_n().
     conjc             - conjugation requested for c.
     m_panel, n_panel  - panel dimensions; also used (with rs_p, cs_p) to
                         detect row vs. column storage of p.
     m_panel_max,
     n_panel_max       - referenced only by the disabled debug output.
     panel_dim,
     panel_len         - short and long dimension of the micro-panel.
     kappa             - scalar applied while packing.
     c, rs_c, cs_c     - source matrix and its strides.
     incc, ldc         - strides of c handed to the pack kernel.
     p, rs_p, cs_p     - destination micro-panel and its strides.
     is_p, ldp         - offset from the real to the imaginary section of
                         p, and the leading dimension of p. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp \
) \
{ \
doff_t diagoffc_abs; \
dim_t i, j; \
bool_t row_stored; \
bool_t col_stored; \
\
\
/* Create flags to indicate row or column storage. Since we don't
have the schema in scope, we must use the dimensions and strides
of the micro-panel to determine whether it is row- or column-
stored. */ \
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
\
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
{ \
c = c + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
bli_swap_incs( incc, ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername)( conjc, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, is_p, ldp ); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
{ \
/* The panel is split along its length into p10 (everything before
the split point) and p12 (everything after it); the diagonal
block p11 is rewritten afterwards from the stored triangle. */ \
ctype_r* restrict p_r = ( ctype_r* )p; \
\
ctype_r* restrict one_r = PASTEMAC(chr,1); \
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
\
ctype* restrict c10; \
ctype_r* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
ctype* restrict c12; \
ctype_r* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc < 0 ) || \
( row_stored && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
/* p10 (length diagoffc_abs) is read from c as-is. p12 (the
remainder, which includes the diagonal block) is read with
incc and ldc swapped, and conjugated if c is Hermitian. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p_r; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
/* Since j == diagoffc_abs here, diagoffc12 evaluates to zero and
the pointer adjustment of c12 below adds nothing. */ \
diagoffc12 = diagoffc_abs - j; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
/* p10 (length diagoffc_abs + panel_dim, i.e. up to and including
the diagonal block) is read from c shifted by diagoffc10 with
incc and ldc swapped, and conjugated if c is Hermitian. p12
(the remainder) is read from c as-is. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p_r; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
PASTEMAC(ch,kername)( conjc10, \
p10_dim, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
p10, is_p, ldp ); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
PASTEMAC(ch,kername)( conjc12, \
p12_dim, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
p12, is_p, ldp ); \
\
/* Pack the stored triangle of c11 to p11. */ \
{ \
/* c11 is accessed through ctype_r pointers: c11_r points at the
first ctype_r of c11 and c11_i one ctype_r later, and the
strides of c are doubled accordingly (rs_c11, cs_c11). Note
that this scope declares its own j, which shadows the outer
one. */ \
inc_t cs_p11; \
inc_t rs_p11; \
dim_t p11_m = panel_dim; \
dim_t p11_n = panel_dim; \
inc_t rs_c11 = 2*rs_c; \
inc_t cs_c11 = 2*cs_c; \
dim_t j = diagoffc_abs; \
ctype* c11 = ( ctype* )c + (j )*ldc; \
ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \
ctype_r* c11_r = ( ctype_r* )c11; \
ctype_r* c11_i = ( ctype_r* )c11 + 1; \
ctype_r* p11_r = ( ctype_r* )p11; \
ctype_r* p11_i = ( ctype_r* )p11 + is_p; \
ctype_r* alpha_r = one_r; \
ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
\
/* Compute the row and column strides of p11. */ \
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
\
/* Copy the real part of the stored triangle of c11 to p11_r. */ \
PASTEMAC(chr,scal2m)( 0, \
BLIS_NONUNIT_DIAG, \
uploc, \
BLIS_NO_TRANSPOSE, \
p11_m, \
p11_n, \
alpha_r, \
c11_r, rs_c11, cs_c11, \
p11_r, rs_p11, cs_p11 ); \
\
/* Copy the imaginary part of the stored triangle of c11 to p11_i,
scaling by -1 if conjugation on c was requested. */ \
PASTEMAC(chr,scal2m)( 0, \
BLIS_NONUNIT_DIAG, \
uploc, \
BLIS_NO_TRANSPOSE, \
p11_m, \
p11_n, \
alpha_i, \
c11_i, rs_c11, cs_c11, \
p11_i, rs_p11, cs_p11 ); \
\
/* If source matrix c is Hermitian, we have to zero out the
imaginary components of the diagonal of p11 in case the
corresponding elements in c11 were not already zero. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
for ( i = 0; i < p11_m; ++i ) \
{ \
ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \
\
PASTEMAC(chr,set0s)( *pi11_i ); \
} \
} \
\
/* Apply kappa to the part of p11 that corresponds to the stored
part of c11 that was copied above. */ \
if ( bli_is_upper( uploc ) ) \
{ \
PASTEMAC(ch,scalris_mxn_u)( 0, \
p11_m, \
p11_n, \
&kappa_r, \
&kappa_i, \
p11_r, \
p11_i, rs_p11, cs_p11 ); \
} \
else \
{ \
PASTEMAC(ch,scalris_mxn_l)( 0, \
p11_m, \
p11_n, \
&kappa_r, \
&kappa_i, \
p11_r, \
p11_i, rs_p11, cs_p11 ); \
} \
/* Disabled debugging output (prints the real and imaginary sections
of the packed panel):
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \
p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \
p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" ); \
*/ \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_4m, packm_cxk_4m )
/* packm_tri_cxk_4m

   Template for the helper that packs one micro-panel of a triangular
   matrix c into p. It is instantiated below through
   INSERT_GENTFUNCCO_BASIC with kername = packm_cxk_4m.

   The whole panel is first packed by the kernel. The panel_dim x panel_dim
   diagonal block p11, located bli_abs( diagoffp ) * ldp elements into the
   panel, is then adjusted in place:
     - if diagc is unit, its diagonal is overwritten with kappa (real and
       imaginary parts separately);
     - if invdiag is TRUE, the diagonal is inverted with invertris;
     - the region on the other side of the diagonal from uploc is set to
       zero in both sections.

   p is addressed as an array of ctype_r: real parts at p and imaginary
   parts at p + is_p.

   strucc, m_panel_max, n_panel_max, rs_c and cs_c are not referenced by
   this routine. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp \
) \
{ \
bool_t row_stored; \
bool_t col_stored; \
\
\
/* Create flags to indicate row or column storage. Since we don't
have the schema in scope, we must use the dimensions and strides
of the micro-panel to determine whether it is row- or column-
stored. */ \
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
\
\
/* Pack the panel. */ \
PASTEMAC(ch,kername)( conjc, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, is_p, ldp ); \
\
\
/* Tweak the panel according to its triangular structure */ \
{ \
/* j is the distance (in units of ldp) from the start of the panel
to the diagonal block; p11_r and p11_i address that block within
the real and imaginary sections of p. */ \
dim_t j = bli_abs( diagoffp ); \
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \
inc_t cs_p11; \
inc_t rs_p11; \
\
/* Compute the row and column strides of p11. */ \
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
\
\
/* If the diagonal of c is implicitly unit, explicitly set the
diagonal of the packed panel to kappa. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
\
PASTEMAC(chr,setd)( 0, \
m_panel, \
n_panel, \
&kappa_r, \
p11_r, rs_p11, cs_p11 ); \
PASTEMAC(chr,setd)( 0, \
m_panel, \
n_panel, \
&kappa_i, \
p11_i, rs_p11, cs_p11 ); \
} \
\
\
/* If requested, invert the diagonal of the packed panel.
NOTE(review): the offsets below are computed with rs_p/cs_p,
whereas the setd/setm calls in this routine use rs_p11/cs_p11. The
two agree only when the unit stride of p is 1 -- confirm against
the definitions of bli_is_row_stored_f/bli_is_col_stored_f. */ \
if ( invdiag == TRUE ) \
{ \
dim_t i; \
\
for ( i = 0; i < panel_dim; ++i ) \
{ \
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
\
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
} \
} \
\
\
/* Set the region opposite the diagonal of p to zero. To do this,
we need to reference the "unstored" region on the other side of
the diagonal. This amounts to toggling uploc and then shifting
the diagonal offset to shrink the newly referenced region (by
one diagonal). Note that this zero-filling is not needed for
trsm, since the unstored region is not referenced by the trsm
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel.*/ \
{ \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
uplo_t uplop11 = uploc; \
doff_t diagoffp11 = 0; \
\
bli_toggle_uplo( uplop11 ); \
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
\
/* The same zero-fill is applied to the real and imaginary
sections of p11. */ \
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_r, rs_p11, cs_p11 ); \
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_i, rs_p11, cs_p11 ); \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_4m, packm_cxk_4m )

View File

@@ -32,14 +32,16 @@
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
@@ -49,7 +51,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
);
INSERT_GENTPROT_BASIC( packm_gen_cxk )
INSERT_GENTPROTCO_BASIC( packm_struc_cxk_4m )
@@ -65,11 +67,41 @@ void PASTEMAC(ch,varname)( \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp \
);
INSERT_GENTPROTCO_BASIC( packm_gen_cxk_4m )
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_4m )
/* Prototype template for the triangular 4m micro-panel packing helper,
   instantiated below for packm_tri_cxk_4m.
   NOTE(review): the second parameter is named diagoffc here, while the
   definition of packm_tri_cxk_4m names it diagoffp. Parameter names in
   prototypes need not match the definition, but confirm which name is
   intended. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp \
);
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_4m )
INSERT_GENTPROTCO_BASIC( packm_gen_cxk_3m )

View File

@@ -1,720 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
packm_tri_cxk: function template that packs one micro-panel of a
triangular matrix c into the packed buffer p. The template is
instantiated by INSERT_GENTFUNC_BASIC0 below (presumably once per basic
datatype -- confirm against the macro's definition).

Steps performed, in order:
1. Choose panel_dim/panel_len and the strides of c and p according to
whether bli_is_row_stored_f() reports p as row-stored (column
panel) or not (row panel).
2. Copy the panel via packm_cxk, forwarding conjc and kappa.
3. If diagc is unit, write kappa onto the diagonal at offset diagoffp.
4. If invdiag is TRUE, invert that diagonal in place.
5. Zero the part of p on the opposite side of the diagonal from uploc.
6. Zero the padding rows/columns between (m_panel, n_panel) and
(m_panel_max, n_panel_max).
7. If both dimensions were padded, put ones on the diagonal of the
bottom-right padding corner.

Parameters:
strucc        accepted for interface uniformity; not referenced here.
diagoffp      diagonal offset of the panel.
diagc         whether the diagonal of c is implicitly unit.
uploc         which triangle of c is stored.
conjc         conjugation flag forwarded to packm_cxk.
invdiag       TRUE to invert the packed diagonal.
m_panel,
n_panel       dimensions of the panel actually holding data.
m_panel_max,
n_panel_max   padded dimensions of the panel in p.
kappa         scalar forwarded to packm_cxk and used for the unit
diagonal.
c, rs_c, cs_c source matrix and its row/column strides.
p, rs_p, cs_p destination panel and its row/column strides.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
\
dim_t panel_dim; \
dim_t panel_len; \
inc_t incc, ldc; \
inc_t ldp; \
\
\
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
panel_len = m_panel; \
incc = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
panel_len = n_panel; \
incc = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
} \
\
\
/* Pack the panel. */ \
PASTEMAC(ch,packm_cxk)( conjc, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, ldp ); \
\
\
/* If the diagonal of c is implicitly unit, explicitly set the
diagonal of the packed panel to kappa. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
PASTEMAC(ch,setd)( diagoffp, \
m_panel, \
n_panel, \
kappa, \
p, rs_p, cs_p ); \
} \
\
/* If requested, invert the diagonal of the packed panel. */ \
if ( invdiag == TRUE ) \
{ \
PASTEMAC(ch,invertd)( diagoffp, \
m_panel, \
n_panel, \
p, rs_p, cs_p ); \
} \
\
/* Set the region opposite the diagonal of p to zero. To do this,
we need to reference the "unstored" region on the other side of
the diagonal. This amounts to toggling uploc and then shifting
the diagonal offset to shrink the newly referenced region (by
one diagonal). Note that this zero-filling is not needed for
trsm, since the unstored region is not referenced by the trsm
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel. */ \
{ \
uplo_t uplop = uploc; \
\
/* NOTE(review): the shift below presumably updates diagoffp in
place; diagoffp is only read by the setm call that follows, so
the earlier diagonal operations are unaffected -- confirm. */ \
bli_toggle_uplo( uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
\
PASTEMAC(ch,setm)( diagoffp, \
BLIS_NONUNIT_DIAG, \
uplop, \
m_panel, \
n_panel, \
zero, \
p, rs_p, cs_p ); \
} \
\
/* The packed memory region was acquired/allocated with "aligned"
dimensions (ie: dimensions that were possibly inflated up to a
multiple). When these dimensions are inflated, it creates empty
regions along the bottom and/or right edges of the matrix. If
either region exists, we set them to zero. This allows the
micro-kernel to remain simple since it does not need to support
different register blockings for the edge cases. */ \
if ( m_panel != m_panel_max ) \
{ \
dim_t i = m_panel; \
dim_t m_edge = m_panel_max - i; \
dim_t n_edge = n_panel_max; \
ctype* p_edge = p + (i )*rs_p; \
\
PASTEMAC(ch,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
} \
\
if ( n_panel != n_panel_max ) \
{ \
dim_t j = n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - j; \
ctype* p_edge = p + (j )*cs_p; \
\
PASTEMAC(ch,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p ); \
} \
\
/* If this panel is an edge case in both panel dimension and length,
then it must be a bottom-right corner case. Set the part of the
diagonal that extends into the zero-padded region to identity.
NOTE: This is actually only necessary when packing for trsm, as
it helps prevent NaNs and Infs from creeping into the computation.
However, we set the region to identity for trmm as well. Those
1.0's end up getting multiplied by the 0.0's in the zero-padded
region of the other matrix, so there is no harm in this. */ \
if ( m_panel != m_panel_max && \
n_panel != n_panel_max ) \
{ \
dim_t i = m_panel; \
dim_t j = n_panel; \
dim_t m_br = m_panel_max - i; \
dim_t n_br = n_panel_max - j; \
ctype* one = PASTEMAC(ch,1); \
ctype* p_br = p + (i )*rs_p + (j )*cs_p; \
\
PASTEMAC(ch,setd)( 0, \
m_br, \
n_br, \
one, \
p_br, rs_p, cs_p ); \
} \
/*
PASTEMAC(ch,fprintm)( stdout, "packm_var1: setting br unit diag", m_br, n_br, \
p_edge, rs_p, cs_p, "%4.1f", "" ); \
*/ \
/*
if ( rs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", m_panel_max, n_panel_max, \
p, rs_p, cs_p, "%4.1f", "" ); \
if ( cs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", m_panel_max, n_panel_max, \
p, rs_p, cs_p, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( packm_tri_cxk )
/*
packm_tri_cxk_4m: function template that packs one micro-panel of a
complex triangular matrix c into p with the real and imaginary parts
held in two separate sub-panels of real type ctype_r. The imaginary
sub-panel starts psp = ldp * panel_len_max real elements after the real
one. The template is instantiated by INSERT_GENTFUNCCO_BASIC0 below.

Steps performed, in order:
1. Choose panel_dim/panel_len/panel_len_max and the strides of c and p
according to whether bli_is_row_stored_f() reports p as row-stored.
2. Copy the panel via packm_cxk_4m, forwarding conjc, kappa and psp.
3. Locate the diagonal block p11, which begins |diagoffp| leading-
dimension steps into each sub-panel, then:
a. if diagc is unit, write real(kappa)/imag(kappa) onto the
diagonals of the real/imaginary sub-panels;
b. if invdiag is TRUE, invert each diagonal element (as a complex
value split across the two sub-panels) via invertris;
c. zero the part of p11 on the opposite side of the diagonal from
uploc in both sub-panels.
4. Zero the padding rows/columns between (m_panel, n_panel) and
(m_panel_max, n_panel_max) in both sub-panels.
5. If both dimensions were padded, set the diagonal of the bottom-right
padding corner to one in the real sub-panel and zero in the
imaginary sub-panel.

Parameters have the same meaning as the argument list suggests:
strucc is not referenced here; diagoffp/diagc/uploc describe the
triangular structure; conjc and kappa are forwarded to packm_cxk_4m;
invdiag requests diagonal inversion; (m_panel, n_panel) and
(m_panel_max, n_panel_max) are the data and padded panel dimensions;
c/rs_c/cs_c and p/rs_p/cs_p are the source and destination with their
strides.
*/
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
) \
{ \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
ctype_r* restrict one_r = PASTEMAC(chr,1); \
\
dim_t i; \
dim_t panel_dim; \
dim_t panel_len; \
dim_t panel_len_max; \
inc_t incc, ldc; \
inc_t psp, ldp; \
\
inc_t rs_p11, cs_p11; \
\
\
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
panel_len = m_panel; \
panel_len_max = m_panel_max; \
incc = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
panel_len = n_panel; \
panel_len_max = n_panel_max; \
incc = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
rs_p11 = 1; \
cs_p11 = cs_p; \
} \
\
/* Compute the panel stride (ie: the element offset to the imaginary
panel). */ \
psp = ldp * panel_len_max; \
\
\
/* Pack the panel. */ \
PASTEMAC(ch,packm_cxk_4m)( conjc, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, psp, ldp ); \
\
\
/* Tweak the panel according to its triangular structure */ \
{ \
/* p11_r/p11_i address the diagonal block within the real and
imaginary sub-panels, j leading-dimension steps from the start. */ \
dim_t j = bli_abs( diagoffp ); \
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \
\
/* If the diagonal of c is implicitly unit, explicitly set the
diagonal of the packed panel to kappa. */ \
/* NOTE(review): setd is given m_panel x n_panel here, whereas the
zero-fill further down treats p11 as panel_dim x panel_dim --
confirm both describe the same diagonal length. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
\
PASTEMAC(chr,setd)( 0, \
m_panel, \
n_panel, \
&kappa_r, \
p11_r, rs_p11, cs_p11 ); \
PASTEMAC(chr,setd)( 0, \
m_panel, \
n_panel, \
&kappa_i, \
p11_i, rs_p11, cs_p11 ); \
} \
\
/* If requested, invert the diagonal of the packed panel. */ \
/* NOTE(review): this loop steps with rs_p/cs_p while the rest of
the p11 handling uses rs_p11/cs_p11; the two only coincide if
the non-leading stride of p is 1 -- confirm. */ \
if ( invdiag == TRUE ) \
{ \
for ( i = 0; i < panel_dim; ++i ) \
{ \
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
\
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
} \
} \
\
/* Set the region opposite the diagonal of p to zero. To do this,
we need to reference the "unstored" region on the other side of
the diagonal. This amounts to toggling uploc and then shifting
the diagonal offset to shrink the newly referenced region (by
one diagonal). Note that this zero-filling is not needed for
trsm, since the unstored region is not referenced by the trsm
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel. */ \
{ \
uplo_t uplop11 = uploc; \
doff_t diagoffp11 = 0; \
\
bli_toggle_uplo( uplop11 ); \
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
\
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_r, rs_p11, cs_p11 ); \
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_i, rs_p11, cs_p11 ); \
} \
} \
\
/* The packed memory region was acquired/allocated with "aligned"
dimensions (ie: dimensions that were possibly inflated up to a
multiple). When these dimensions are inflated, it creates empty
regions along the bottom and/or right edges of the matrix. If
either region exists, we set them to zero. This allows the
micro-kernel to remain simple since it does not need to support
different register blockings for the edge cases. */ \
if ( m_panel != m_panel_max ) \
{ \
dim_t i = m_panel; \
dim_t m_edge = m_panel_max - i; \
dim_t n_edge = n_panel_max; \
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
\
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_i, rs_p, cs_p ); \
} \
\
if ( n_panel != n_panel_max ) \
{ \
dim_t j = n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - j; \
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
\
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_i, rs_p, cs_p ); \
\
} \
\
\
/* If this panel is an edge case in both panel dimension and length,
then it must be a bottom-right corner case. Set the part of the
diagonal that extends into the zero-padded region to identity.
NOTE: This is actually only necessary when packing for trsm, as
it helps prevent NaNs and Infs from creeping into the computation.
However, we set the region to identity for trmm as well. Those
1.0's end up getting multiplied by the 0.0's in the zero-padded
region of the other matrix, so there is no harm in this. */ \
if ( m_panel != m_panel_max && \
n_panel != n_panel_max ) \
{ \
dim_t i = m_panel; \
dim_t j = n_panel; \
dim_t m_br = m_panel_max - i; \
dim_t n_br = n_panel_max - j; \
ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \
\
PASTEMAC(chr,setd)( 0, \
m_br, \
n_br, \
one_r, \
p_br_r, rs_p, cs_p ); \
PASTEMAC(chr,setd)( 0, \
m_br, \
n_br, \
zero_r, \
p_br_i, rs_p, cs_p ); \
} \
}
INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_4m )
/*
packm_tri_cxk_3m: function template that packs one micro-panel of a
complex triangular matrix c into p as three sub-panels of real type
ctype_r, addressed here as "r", "i" and "rpi". Consecutive sub-panels
are psp = ldp * panel_len_max real elements apart (r at 0, i at psp,
rpi at 2*psp). The template is instantiated by INSERT_GENTFUNCCO_BASIC0
below.

Steps performed, in order:
1. Choose panel_dim/panel_len/panel_len_max and the strides of c and p
according to whether bli_is_row_stored_f() reports p as row-stored.
2. Copy the panel via packm_cxk_3m, forwarding conjc, kappa and psp.
3. Locate the diagonal block p11, which begins |diagoffp| leading-
dimension steps into each sub-panel, then:
a. if diagc is unit, write the diagonal of all three sub-panels;
b. if invdiag is TRUE, invert each diagonal element of the r/i
sub-panels via invertris (the rpi sub-panel is left untouched);
c. zero the part of p11 on the opposite side of the diagonal from
uploc in all three sub-panels.
4. Zero the padding rows/columns between (m_panel, n_panel) and
(m_panel_max, n_panel_max) in all three sub-panels.
5. If both dimensions were padded, set the diagonal of the bottom-right
padding corner to one in the r sub-panel and zero in the i
sub-panel; the rpi sub-panel's corner is not modified beyond the
zero padding of step 4.

Parameters: strucc is not referenced here; diagoffp/diagc/uploc
describe the triangular structure; conjc and kappa are forwarded to
packm_cxk_3m; invdiag requests diagonal inversion; (m_panel, n_panel)
and (m_panel_max, n_panel_max) are the data and padded panel
dimensions; c/rs_c/cs_c and p/rs_p/cs_p are the source and destination
with their strides.
*/
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
) \
{ \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
ctype_r* restrict one_r = PASTEMAC(chr,1); \
\
dim_t i; \
dim_t panel_dim; \
dim_t panel_len; \
dim_t panel_len_max; \
inc_t incc, ldc; \
inc_t psp, ldp; \
\
inc_t rs_p11, cs_p11; \
\
\
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
panel_len = m_panel; \
panel_len_max = m_panel_max; \
incc = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
panel_len = n_panel; \
panel_len_max = n_panel_max; \
incc = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
rs_p11 = 1; \
cs_p11 = cs_p; \
} \
\
/* Compute the panel stride (ie: the element offset to the imaginary
panel). */ \
psp = ldp * panel_len_max; \
\
\
/* Pack the panel. */ \
PASTEMAC(ch,packm_cxk_3m)( conjc, \
panel_dim, \
panel_len, \
kappa, \
c, incc, ldc, \
p, psp, ldp ); \
\
\
/* Tweak the panel according to its triangular structure */ \
{ \
/* p11_r/p11_i/p11_rpi address the diagonal block within each of
the three sub-panels, j leading-dimension steps from the start. */ \
dim_t j = bli_abs( diagoffp ); \
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \
ctype_r* p11_rpi = ( ctype_r* )p + 2*psp + (j )*ldp; \
\
/* If the diagonal of c is implicitly unit, explicitly set the
diagonal of the packed panel to kappa. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
\
PASTEMAC(chr,setd)( 0, \
m_panel, \
n_panel, \
&kappa_r, \
p11_r, rs_p11, cs_p11 ); \
PASTEMAC(chr,setd)( 0, \
m_panel, \
n_panel, \
&kappa_i, \
p11_i, rs_p11, cs_p11 ); \
/* NOTE(review): the rpi diagonal is written with kappa_r only.
If the rpi sub-panel is meant to hold real + imaginary, this
matches kappa_r + kappa_i only when kappa is real -- confirm
that callers never pass a kappa with a nonzero imaginary part. */ \
PASTEMAC(chr,setd)( 0, \
m_panel, \
n_panel, \
&kappa_r, \
p11_rpi, rs_p11, cs_p11 ); \
} \
\
/* If requested, invert the diagonal of the packed panel. Note
that we do not need to update the ri panel since inverted
diagonals are only needed by trsm, which does not use the
p11 section of the ri panel. */ \
if ( invdiag == TRUE ) \
{ \
for ( i = 0; i < panel_dim; ++i ) \
{ \
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
\
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
} \
} \
\
/* Set the region opposite the diagonal of p to zero. To do this,
we need to reference the "unstored" region on the other side of
the diagonal. This amounts to toggling uploc and then shifting
the diagonal offset to shrink the newly referenced region (by
one diagonal). Note that this zero-filling is not needed for
trsm, since the unstored region is not referenced by the trsm
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel. */ \
{ \
uplo_t uplop11 = uploc; \
doff_t diagoffp11 = 0; \
\
bli_toggle_uplo( uplop11 ); \
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
\
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_r, rs_p11, cs_p11 ); \
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_i, rs_p11, cs_p11 ); \
PASTEMAC(chr,setm)( diagoffp11, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
zero_r, \
p11_rpi, rs_p11, cs_p11 ); \
} \
} \
\
\
/* The packed memory region was acquired/allocated with "aligned"
dimensions (ie: dimensions that were possibly inflated up to a
multiple). When these dimensions are inflated, it creates empty
regions along the bottom and/or right edges of the matrix. If
either region exists, we set them to zero. This allows the
micro-kernel to remain simple since it does not need to support
different register blockings for the edge cases. */ \
if ( m_panel != m_panel_max ) \
{ \
dim_t i = m_panel; \
dim_t m_edge = m_panel_max - i; \
dim_t n_edge = n_panel_max; \
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (i )*rs_p; \
\
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_i, rs_p, cs_p ); \
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_rpi, rs_p, cs_p ); \
} \
\
if ( n_panel != n_panel_max ) \
{ \
dim_t j = n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - j; \
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (j )*cs_p; \
\
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_i, rs_p, cs_p ); \
PASTEMAC(chr,setm)( 0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero_r, \
p_edge_rpi, rs_p, cs_p ); \
} \
\
/* If this panel is an edge case in both panel dimension and length,
then it must be a bottom-right corner case. Set the part of the
diagonal that extends into the zero-padded region to identity.
NOTE: This is actually only necessary when packing for trsm, as
it helps prevent NaNs and Infs from creeping into the computation.
However, we set the region to identity for trmm as well. Those
1.0's end up getting multiplied by the 0.0's in the zero-padded
region of the other matrix, so there is no harm in this. */ \
if ( m_panel != m_panel_max && \
n_panel != n_panel_max ) \
{ \
dim_t i = m_panel; \
dim_t j = n_panel; \
dim_t m_br = m_panel_max - i; \
dim_t n_br = n_panel_max - j; \
ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \
\
PASTEMAC(chr,setd)( 0, \
m_br, \
n_br, \
one_r, \
p_br_r, rs_p, cs_p ); \
PASTEMAC(chr,setd)( 0, \
m_br, \
n_br, \
zero_r, \
p_br_i, rs_p, cs_p ); \
} \
}
INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_3m )

View File

@@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -54,8 +54,8 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -133,7 +133,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -145,8 +145,8 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -232,7 +232,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -244,8 +244,8 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -339,7 +339,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -351,8 +351,8 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -454,7 +454,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -466,8 +466,8 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -577,7 +577,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -589,8 +589,8 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -708,7 +708,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -720,8 +720,8 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -847,7 +847,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -859,8 +859,8 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \

View File

@@ -40,7 +40,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
);
INSERT_GENTPROT_BASIC( packm_ref_2xk_3m )

View File

@@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -54,7 +54,7 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -128,7 +128,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -140,7 +140,7 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -222,7 +222,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -234,7 +234,7 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -324,7 +324,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -336,7 +336,7 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -434,7 +434,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -446,7 +446,7 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -552,7 +552,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -564,7 +564,7 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -678,7 +678,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -690,7 +690,7 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
@@ -812,7 +812,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
) \
{ \
const inc_t inca2 = 2 * inca; \
@@ -824,7 +824,7 @@ void PASTEMAC(ch,varname)( \
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
ctype_r* restrict pi1_r = ( ctype_r* )p; \
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
\
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \

View File

@@ -40,7 +40,7 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
void* kappa, \
void* a, inc_t inca, inc_t lda, \
void* p, inc_t psp, inc_t ldp \
void* p, inc_t is_p, inc_t ldp \
);
INSERT_GENTPROT_BASIC( packm_ref_2xk_4m )

View File

@@ -119,7 +119,7 @@ void bli_gemm3m_cntl_init()
gemm3m_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
gemm3m_mr,
gemm3m_kr,
TRUE, // densify; used by hemm/symm
@@ -132,7 +132,7 @@ void bli_gemm3m_cntl_init()
gemm3m_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
gemm3m_kr,
gemm3m_nr,
TRUE, // densify; used by hemm/symm

View File

@@ -119,7 +119,7 @@ void bli_gemm4m_cntl_init()
gemm4m_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
gemm4m_mr,
gemm4m_kr,
TRUE, // densify; used by hemm/symm
@@ -132,7 +132,7 @@ void bli_gemm4m_cntl_init()
gemm4m_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
gemm4m_kr,
gemm4m_nr,
TRUE, // densify; used by hemm/symm

View File

@@ -62,7 +62,7 @@ void bli_herk3m_cntl_init()
herk3m_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
gemm3m_mr,
gemm3m_kr,
FALSE, // already dense; densify not necessary
@@ -75,7 +75,7 @@ void bli_herk3m_cntl_init()
herk3m_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
gemm3m_kr,
gemm3m_nr,
FALSE, // already dense; densify not necessary

View File

@@ -62,7 +62,7 @@ void bli_herk4m_cntl_init()
herk4m_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
gemm4m_mr,
gemm4m_kr,
FALSE, // already dense; densify not necessary
@@ -75,7 +75,7 @@ void bli_herk4m_cntl_init()
herk4m_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
gemm4m_kr,
gemm4m_nr,
FALSE, // already dense; densify not necessary

View File

@@ -73,7 +73,7 @@ void bli_trmm3m_cntl_init()
trmm3m_l_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
// IMPORTANT: for consistency with trsm, "k" dim
// multiple is set to mr.
gemm3m_mr,
@@ -88,7 +88,7 @@ void bli_trmm3m_cntl_init()
trmm3m_l_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
// IMPORTANT: m dim multiple here must be mr
// since "k" dim multiple is set to mr above.
gemm3m_mr,
@@ -104,7 +104,7 @@ void bli_trmm3m_cntl_init()
trmm3m_r_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
// IMPORTANT: for consistency with trsm, "k" dim
// multiple is set to nr.
gemm3m_mr,
@@ -119,7 +119,7 @@ void bli_trmm3m_cntl_init()
trmm3m_r_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
// IMPORTANT: m dim multiple here must be nr
// since "k" dim multiple is set to nr above.
gemm3m_nr,

View File

@@ -73,7 +73,7 @@ void bli_trmm4m_cntl_init()
trmm4m_l_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
// IMPORTANT: for consistency with trsm, "k" dim
// multiple is set to mr.
gemm4m_mr,
@@ -88,7 +88,7 @@ void bli_trmm4m_cntl_init()
trmm4m_l_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
// IMPORTANT: m dim multiple here must be mr
// since "k" dim multiple is set to mr above.
gemm4m_mr,
@@ -104,7 +104,7 @@ void bli_trmm4m_cntl_init()
trmm4m_r_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
// IMPORTANT: for consistency with trsm, "k" dim
// multiple is set to nr.
gemm4m_mr,
@@ -119,7 +119,7 @@ void bli_trmm4m_cntl_init()
trmm4m_r_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
// IMPORTANT: m dim multiple here must be nr
// since "k" dim multiple is set to nr above.
gemm4m_nr,

View File

@@ -92,7 +92,7 @@ void bli_trsm3m_cntl_init()
trsm3m_l_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
// IMPORTANT: n dim multiple must be mr to
// support right and bottom-right edge cases
gemm3m_mr,
@@ -107,7 +107,7 @@ void bli_trsm3m_cntl_init()
trsm3m_l_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
// IMPORTANT: m dim multiple must be mr since
// B_pack is updated (ie: serves as C) in trsm
gemm3m_mr,
@@ -123,7 +123,7 @@ void bli_trsm3m_cntl_init()
trsm3m_r_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
gemm3m_nr,
gemm3m_mr,
FALSE, // already dense; densify not necessary
@@ -136,7 +136,7 @@ void bli_trsm3m_cntl_init()
trsm3m_r_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_VARIANT2,
gemm3m_mr,
gemm3m_mr,
TRUE, // densify

View File

@@ -93,7 +93,7 @@ void bli_trsm4m_cntl_init()
trsm4m_l_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
// IMPORTANT: n dim multiple must be mr to
// support right and bottom-right edge cases
gemm4m_mr,
@@ -108,7 +108,7 @@ void bli_trsm4m_cntl_init()
trsm4m_l_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
// IMPORTANT: m dim multiple must be mr since
// B_pack is updated (ie: serves as C) in trsm
gemm4m_mr,
@@ -124,7 +124,7 @@ void bli_trsm4m_cntl_init()
trsm4m_r_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
gemm4m_nr,
gemm4m_mr,
FALSE, // already dense; densify not necessary
@@ -137,7 +137,7 @@ void bli_trsm4m_cntl_init()
trsm4m_r_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT4,
BLIS_VARIANT2,
gemm4m_mr,
gemm4m_mr,
TRUE, // densify

View File

@@ -101,5 +101,31 @@ typedef void \
INSERT_GENTPROT_BASIC( gemmtrsm_ukr_t )
// -- packm_struc_cxk kernel --
//
// Function-pointer type for the structure-aware packm kernels. The
// GENTPROT template below emits one typedef per instantiation, named by
// pasting the datatype character onto "packm_ker_t" (via PASTECH), so
// that a generic void* kernel address can be cast to a correctly typed
// pointer inside the datatype-specific packm_blk_var2() functions.
//
// The argument list is: matrix structure (strucc), diagonal offset
// (diagoffc), unit/non-unit diagonal (diagc), stored triangle (uploc),
// conjugation (conjc), diagonal-inversion flag (invdiag), the panel's
// actual and maximum (padded) dimensions, the scalar kappa, and the
// source (c) and destination (p) buffers with their row/column strides.
#undef GENTPROT
#define GENTPROT( ctype, ch, tname ) \
\
typedef void \
(*PASTECH(ch,tname))( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
bool_t invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p \
);
INSERT_GENTPROT_BASIC( packm_ker_t )
#endif