mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Reorganized packm implementation.
Details: - Reorganized packm variants and structure-aware kernels so that all routines for a given pack format (4m, 3m, regular) reside in a single file. - Renamed _blk_var4 to _blk_var2 and generalized so that it will work for both 4m and 3m, and adjusted 4m/3m _cntl_init() functions accordingly. - Added a new packm_ker_t function pointer type to bli_kernel_type_defs.h to facilitate function pointer typecasting in the datatype-specific packm_blk_var2() functions. - Deprecated _blk_var3. - Fixed a bug in the triangular micro-panel packing facility that affected trmm and trmm3 with unit diagonals.
This commit is contained in:
@@ -42,12 +42,11 @@
|
||||
#include "bli_packm_unb_var1.h"
|
||||
|
||||
#include "bli_packm_blk_var1.h"
|
||||
#include "bli_packm_blk_var3.h"
|
||||
#include "bli_packm_blk_var4.h"
|
||||
#include "bli_packm_blk_var2.h"
|
||||
|
||||
#include "bli_packm_gen_cxk.h"
|
||||
#include "bli_packm_herm_cxk.h"
|
||||
#include "bli_packm_tri_cxk.h"
|
||||
#include "bli_packm_struc_cxk.h"
|
||||
#include "bli_packm_struc_cxk_4m.h"
|
||||
#include "bli_packm_struc_cxk_3m.h"
|
||||
|
||||
#include "bli_packm_cxk.h"
|
||||
#include "bli_packm_cxk_4m.h"
|
||||
|
||||
@@ -54,11 +54,16 @@ typedef void (*FUNCPTR_T)(
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
void* packm_ker,
|
||||
packm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
|
||||
|
||||
extern func_t* packm_struc_cxk_kers;
|
||||
extern func_t* packm_struc_cxk_4m_kers;
|
||||
extern func_t* packm_struc_cxk_3m_kers;
|
||||
|
||||
|
||||
void bli_packm_blk_var1( obj_t* c,
|
||||
obj_t* p,
|
||||
@@ -93,6 +98,9 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
|
||||
void* buf_kappa;
|
||||
|
||||
func_t* packm_kers;
|
||||
void* packm_ker;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// This variant assumes that the micro-kernel will always apply the
|
||||
@@ -101,6 +109,13 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
// scale during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
|
||||
|
||||
// Choose the correct func_t object.
|
||||
packm_kers = packm_struc_cxk_kers;
|
||||
|
||||
// Query the datatype-specific function pointer from the func_t object.
|
||||
packm_ker = bli_func_obj_query( dt_cp, packm_kers );
|
||||
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_cp];
|
||||
@@ -123,12 +138,13 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p,
|
||||
packm_ker,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
#define GENTFUNC( ctype, ch, varname, kertype ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
@@ -148,9 +164,12 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void* packm_ker, \
|
||||
packm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
PASTECH(ch,kertype) packm_ker_cast = packm_ker; \
|
||||
\
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
@@ -301,7 +320,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is diagonal-intersecting. Notice that we
|
||||
cannot bury the following conditional logic into
|
||||
packm_tri_cxk() because we need to know the value of
|
||||
packm_struc_cxk() because we need to know the value of
|
||||
panel_len_max_i so we can properly increment p_inc. */ \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
@@ -334,22 +353,24 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
}\
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is usually LESS than ps_p. */ \
|
||||
/* NOTE: This value is usually LESS than ps_p because triangular
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
@@ -363,17 +384,19 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffc_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
@@ -390,17 +413,19 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
packm_ker_cast( BLIS_GENERAL, \
|
||||
0, \
|
||||
diagc, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
@@ -420,5 +445,5 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( packm_blk_var1 )
|
||||
INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t )
|
||||
|
||||
|
||||
@@ -58,6 +58,7 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void* packm_ker, \
|
||||
packm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
|
||||
@@ -54,13 +54,18 @@ typedef void (*FUNCPTR_T)(
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
void* packm_ker,
|
||||
packm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4);
|
||||
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2);
|
||||
|
||||
extern func_t* packm_struc_cxk_kers;
|
||||
extern func_t* packm_struc_cxk_4m_kers;
|
||||
extern func_t* packm_struc_cxk_3m_kers;
|
||||
|
||||
|
||||
void bli_packm_blk_var4( obj_t* c,
|
||||
void bli_packm_blk_var2( obj_t* c,
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t )
|
||||
{
|
||||
@@ -95,9 +100,13 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
obj_t* kappa_p;
|
||||
void* buf_kappa;
|
||||
|
||||
func_t* packm_kers;
|
||||
void* packm_ker;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
/*
|
||||
// We want this variant to behave identically to that of variant 1
|
||||
// in the real domain.
|
||||
if ( bli_is_real( dt_cp ) )
|
||||
@@ -105,6 +114,7 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
bli_packm_blk_var1( c, p, t );
|
||||
return;
|
||||
}
|
||||
*/
|
||||
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
@@ -140,11 +150,20 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
|
||||
|
||||
|
||||
// Choose the correct func_t object based on the pack_t schema.
|
||||
if ( bli_is_4m_packed( schema ) ) packm_kers = packm_struc_cxk_4m_kers;
|
||||
else if ( bli_is_3m_packed( schema ) ) packm_kers = packm_struc_cxk_3m_kers;
|
||||
else packm_kers = packm_struc_cxk_kers;
|
||||
|
||||
// Query the datatype-specific function pointer from the func_t object.
|
||||
packm_ker = bli_func_obj_query( dt_cp, packm_kers );
|
||||
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
//f = ftypes[dt_cp];
|
||||
if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var4;
|
||||
else f = bli_zpackm_blk_var4;
|
||||
if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var2;
|
||||
else f = bli_zpackm_blk_var2;
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
@@ -164,12 +183,13 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p,
|
||||
packm_ker,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kertype ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
@@ -189,9 +209,12 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void* packm_ker, \
|
||||
packm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
PASTECH(ch,kertype) packm_ker_cast = packm_ker; \
|
||||
\
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
@@ -224,6 +247,7 @@ void PASTEMAC(ch,varname)( \
|
||||
conj_t conjc; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
dim_t ss_p; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
@@ -294,6 +318,15 @@ void PASTEMAC(ch,varname)( \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the "storage stride" of p. This is usually equal to ldp,
|
||||
because usually ps_p = ldp * panel_len_max (e.g. where ldp is
|
||||
equal to rs_p = packnr, or cs_p = packmr). But for 3m, the product
|
||||
ldp * panel_len_max must be scaled by 3/2. packm_init() has already
|
||||
scaled ps_p by 3/2, if needed, so rather than scale the product by
|
||||
3/2 manually, we just compute the correct scaling factor and use it
|
||||
instead of ldp. */ \
|
||||
ss_p = ps_p / panel_len_max; \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
@@ -342,7 +375,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is diagonal-intersecting. Notice that we
|
||||
cannot bury the following conditional logic into
|
||||
packm_tri_cxk() because we need to know the value of
|
||||
packm_struc_cxk() because we need to know the value of
|
||||
panel_len_max_i so we can properly increment p_inc. */ \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
@@ -375,41 +408,25 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk_4m)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is usually LESS than ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
\
|
||||
/*
|
||||
if ( rs_p == 1 ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
/*
|
||||
if ( cs_p == 1 ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
/* NOTE: This value is usually LESS than ps_p because triangular
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
p_inc = ss_p * panel_len_max_i; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
@@ -422,21 +439,23 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk_4m)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffc_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
p_inc = ss_p * panel_len_max_i; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -449,37 +468,39 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk_4m)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
packm_ker_cast( BLIS_GENERAL, \
|
||||
0, \
|
||||
diagc, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_i", *m_panel_max, *n_panel_max, \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_r", *m_panel_max, *n_panel_max, \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_i", *m_panel_max, *n_panel_max, \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
p_inc = ss_p * panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
@@ -488,5 +509,5 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_blk_var4 )
|
||||
INSERT_GENTFUNCCO_BASIC( packm_blk_var2, packm_ker_t )
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_packm_blk_var3( obj_t* c,
|
||||
void bli_packm_blk_var2( obj_t* c,
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t );
|
||||
|
||||
@@ -58,8 +58,9 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
packm_thrinfo_t* thread \
|
||||
void* packm_ker, \
|
||||
packm_thrinfo_t* t \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_blk_var3 )
|
||||
INSERT_GENTPROTCO_BASIC( packm_blk_var2 )
|
||||
|
||||
@@ -1,477 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
struc_t strucc,
|
||||
doff_t diagoffc,
|
||||
diag_t diagc,
|
||||
uplo_t uploc,
|
||||
trans_t transc,
|
||||
pack_t schema,
|
||||
bool_t invdiag,
|
||||
bool_t revifup,
|
||||
bool_t reviflo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
packm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3);
|
||||
|
||||
|
||||
void bli_packm_blk_var3( obj_t* c,
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
|
||||
struc_t strucc = bli_obj_struc( *c );
|
||||
doff_t diagoffc = bli_obj_diag_offset( *c );
|
||||
diag_t diagc = bli_obj_diag( *c );
|
||||
uplo_t uploc = bli_obj_uplo( *c );
|
||||
trans_t transc = bli_obj_conjtrans_status( *c );
|
||||
pack_t schema = bli_obj_pack_status( *p );
|
||||
bool_t invdiag = bli_obj_has_inverted_diag( *p );
|
||||
bool_t revifup = bli_obj_is_pack_rev_if_upper( *p );
|
||||
bool_t reviflo = bli_obj_is_pack_rev_if_lower( *p );
|
||||
|
||||
dim_t m_p = bli_obj_length( *p );
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
inc_t cs_c = bli_obj_col_stride( *c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( *p );
|
||||
inc_t rs_p = bli_obj_row_stride( *p );
|
||||
inc_t cs_p = bli_obj_col_stride( *p );
|
||||
dim_t pd_p = bli_obj_panel_dim( *p );
|
||||
inc_t ps_p = bli_obj_panel_stride( *p );
|
||||
|
||||
obj_t kappa;
|
||||
obj_t* kappa_p;
|
||||
void* buf_kappa;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
// We want this variant to behave identically to that of variant 1
|
||||
// in the real domain.
|
||||
if ( bli_is_real( dt_cp ) )
|
||||
{
|
||||
bli_packm_blk_var1( c, p, t );
|
||||
return;
|
||||
}
|
||||
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing complex domain micro-kernels in terms of their
|
||||
// real domain counterparts. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( thread_am_ochief( t ) )
|
||||
{
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
|
||||
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
//f = ftypes[dt_cp];
|
||||
if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var3;
|
||||
else f = bli_zpackm_blk_var3;
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
schema,
|
||||
invdiag,
|
||||
revifup,
|
||||
reviflo,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
packm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict c_begin; \
|
||||
ctype* restrict p_begin; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t num_iter; \
|
||||
dim_t it, ic, ip; \
|
||||
dim_t ic0, ip0; \
|
||||
doff_t ic_inc, ip_inc; \
|
||||
doff_t diagoffc_i; \
|
||||
doff_t diagoffc_inc; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_off_i; \
|
||||
inc_t vs_c; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp, p_inc; \
|
||||
dim_t* m_panel_full; \
|
||||
dim_t* n_panel_full; \
|
||||
dim_t* m_panel_use; \
|
||||
dim_t* n_panel_use; \
|
||||
dim_t* m_panel_max; \
|
||||
dim_t* n_panel_max; \
|
||||
conj_t conjc; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
doff_t diagoffp_i; \
|
||||
\
|
||||
\
|
||||
/* If C is zeros and part of a triangular matrix, then we don't need
|
||||
to pack it. */ \
|
||||
if ( bli_is_zeros( uploc ) && \
|
||||
bli_is_triangular( strucc ) ) return; \
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
express the remaining parameters and code. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_swap_incs( rs_c, cs_c ); \
|
||||
bli_negate_diag_offset( diagoffc ); \
|
||||
bli_toggle_uplo( uploc ); \
|
||||
bli_toggle_trans( transc ); \
|
||||
} \
|
||||
\
|
||||
/* Create flags to indicate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t )panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
m_panel_use = &panel_len_i; \
|
||||
n_panel_use = &panel_dim_i; \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim_max; \
|
||||
ldp = cs_p; \
|
||||
m_panel_full = &panel_dim_i; \
|
||||
n_panel_full = &n; \
|
||||
m_panel_use = &panel_dim_i; \
|
||||
n_panel_use = &panel_len_i; \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
|
||||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
|
||||
{ \
|
||||
ic0 = (num_iter - 1) * panel_dim_max; \
|
||||
ic_inc = -panel_dim_max; \
|
||||
ip0 = num_iter - 1; \
|
||||
ip_inc = -1; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
ip0 = 0; \
|
||||
ip_inc = 1; \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is completely unstored (i.e., zero). If the panel
|
||||
is unstored, we do nothing. (Notice that we don't even
|
||||
increment p_begin.) */ \
|
||||
\
|
||||
continue; \
|
||||
} \
|
||||
else if ( bli_is_triangular( strucc ) && \
|
||||
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is diagonal-intersecting. Notice that we
|
||||
cannot bury the following conditional logic into
|
||||
packm_tri_cxk() because we need to know the value of
|
||||
panel_len_max_i so we can properly increment p_inc. */ \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then the constraint that cache
|
||||
blocksizes be a whole multiple of the register blocksizes
|
||||
was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc_i < 0 ) || \
|
||||
( row_stored && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
panel_len_max_i = panel_len_max - panel_off_i; \
|
||||
diagoffp_i = 0; \
|
||||
} \
|
||||
\
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk_3m)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* NOTE: This value is usually LESS than (ps_p*3)/2. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
\
|
||||
/*
|
||||
if ( cs_p == 1 ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a Hermitian or
|
||||
symmetric matrix, which includes stored, unstored, and
|
||||
diagonal-intersecting panels. */ \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk_3m)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to (ps_p*3)/2. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* This case executes if the panel is general, or, if the
|
||||
panel is part of a triangular matrix and is neither unstored
|
||||
(i.e., zero) nor diagonal-intersecting. */ \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk_3m)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to (ps_p*3)/2. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
\
|
||||
} \
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_blk_var3 )
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bli_packm_blk_var4( obj_t* c,
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t );
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
packm_thrinfo_t* t \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_blk_var4 )
|
||||
|
||||
@@ -37,6 +37,10 @@
|
||||
blksz_t* packm_mult_ldim;
|
||||
blksz_t* packm_mult_nvec;
|
||||
|
||||
func_t* packm_struc_cxk_kers;
|
||||
func_t* packm_struc_cxk_4m_kers;
|
||||
func_t* packm_struc_cxk_3m_kers;
|
||||
|
||||
packm_t* packm_cntl_row;
|
||||
packm_t* packm_cntl_col;
|
||||
|
||||
@@ -47,6 +51,30 @@ packm_t* packm_cntl;
|
||||
|
||||
void bli_packm_cntl_init()
|
||||
{
|
||||
// Create function pointer object for each datatype-specific packm
|
||||
// kernel.
|
||||
packm_struc_cxk_kers
|
||||
=
|
||||
bli_func_obj_create( bli_spackm_struc_cxk, FALSE,
|
||||
bli_dpackm_struc_cxk, FALSE,
|
||||
bli_cpackm_struc_cxk, FALSE,
|
||||
bli_zpackm_struc_cxk, FALSE );
|
||||
|
||||
packm_struc_cxk_4m_kers
|
||||
=
|
||||
bli_func_obj_create( NULL, FALSE,
|
||||
NULL, FALSE,
|
||||
bli_cpackm_struc_cxk_4m, FALSE,
|
||||
bli_zpackm_struc_cxk_4m, FALSE );
|
||||
|
||||
packm_struc_cxk_3m_kers
|
||||
=
|
||||
bli_func_obj_create( NULL, FALSE,
|
||||
NULL, FALSE,
|
||||
bli_cpackm_struc_cxk_3m, FALSE,
|
||||
bli_zpackm_struc_cxk_3m, FALSE );
|
||||
|
||||
|
||||
// Create blocksize objects for m and n register blocking. We will attach
|
||||
// these to the packm control node so they can be used to (a) allocate a
|
||||
// block whose m and n dimension are multiples of mr and nr, and (b) know
|
||||
@@ -119,6 +147,10 @@ void bli_packm_cntl_init()
|
||||
|
||||
void bli_packm_cntl_finalize()
|
||||
{
|
||||
bli_func_obj_free( packm_struc_cxk_kers );
|
||||
bli_func_obj_free( packm_struc_cxk_4m_kers );
|
||||
bli_func_obj_free( packm_struc_cxk_3m_kers );
|
||||
|
||||
bli_cntl_obj_free( packm_cntl_row );
|
||||
bli_cntl_obj_free( packm_cntl_col );
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
void* kappa,
|
||||
void* a, inc_t inca, inc_t lda,
|
||||
void* p, inc_t psp, inc_t ldp
|
||||
void* p, inc_t is_p, inc_t ldp
|
||||
);
|
||||
|
||||
#undef FUNCPTR_ARRAY_LENGTH
|
||||
@@ -158,7 +158,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
dim_t panel_dim; \
|
||||
@@ -187,7 +187,7 @@ void PASTEMAC(ch,varname)( \
|
||||
n, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, psp, ldp ); \
|
||||
p, is_p, ldp ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -196,8 +196,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict a_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict a_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict p_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict p_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict p_rpi = ( ctype_r* )p + 2*psp; \
|
||||
ctype_r* restrict p_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* restrict p_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
const dim_t inca2 = 2*inca; \
|
||||
const dim_t lda2 = 2*lda; \
|
||||
\
|
||||
|
||||
@@ -44,7 +44,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_cxk_3m )
|
||||
|
||||
@@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
void* kappa,
|
||||
void* a, inc_t inca, inc_t lda,
|
||||
void* p, inc_t psp, inc_t ldp
|
||||
void* p, inc_t is_p, inc_t ldp
|
||||
);
|
||||
|
||||
#undef FUNCPTR_ARRAY_LENGTH
|
||||
@@ -159,7 +159,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
dim_t panel_dim; \
|
||||
@@ -188,7 +188,7 @@ void PASTEMAC(ch,varname)( \
|
||||
n, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, psp, ldp ); \
|
||||
p, is_p, ldp ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -197,7 +197,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict a_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict a_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict p_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict p_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict p_i = ( ctype_r* )p + is_p; \
|
||||
const dim_t inca2 = 2*inca; \
|
||||
const dim_t lda2 = 2*lda; \
|
||||
\
|
||||
|
||||
@@ -44,7 +44,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_cxk_4m )
|
||||
|
||||
@@ -1,401 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
\
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t ldp; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp ); \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimensions are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( packm_gen_cxk )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_4m)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimensions are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_4m )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_3m)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimensions are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_rpi, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_rpi, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_3m )
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -44,9 +44,9 @@ static FUNCPTR_T vars[6][3] =
|
||||
{
|
||||
// unblocked optimized unblocked blocked
|
||||
{ bli_packm_unb_var1, NULL, bli_packm_blk_var1 },
|
||||
{ NULL, NULL, bli_packm_blk_var2 },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, bli_packm_blk_var3 },
|
||||
{ NULL, NULL, bli_packm_blk_var4 },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
};
|
||||
|
||||
511
frame/1m/packm/bli_packm_struc_cxk.c
Normal file
511
frame/1m/packm/bli_packm_struc_cxk.c
Normal file
@@ -0,0 +1,511 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t ldp; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Handle micro-panel packing based on the structure of the matrix
|
||||
being packed. */ \
|
||||
if ( bli_is_general( strucc ) ) \
|
||||
{ \
|
||||
/* For micro-panels of general matrices, we can call the pack
|
||||
kernel front-end directly. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp ); \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* Call a helper function for micro-panels of Hermitian/symmetric
|
||||
matrices. */ \
|
||||
PASTEMAC(ch,packm_herm_cxk)( strucc, \
|
||||
diagoffc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
ldp ); \
|
||||
} \
|
||||
else /* ( bli_is_triangular( strucc ) ) */ \
|
||||
{ \
|
||||
/* Call a helper function for micro-panels of triangular
|
||||
matrices. */ \
|
||||
PASTEMAC(ch,packm_tri_cxk)( strucc, \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
ldp ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimensions are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) ) \
|
||||
{ \
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting multiplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
dim_t i = m_panel; \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype* p_br = p + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one, \
|
||||
p_br, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t i, j; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to indicate row or column storage. Since we don't
|
||||
have the schema in scope, we must use the dimensions and strides
|
||||
of the micro-panel to determine whether it is row- or column-
|
||||
stored. */ \
|
||||
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
\
|
||||
/* Handle the case where the micro-panel does NOT intersect the
|
||||
diagonal separately from the case where it does intersect. */ \
|
||||
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
stored, also taking conjugation into account. (Note this
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
c = c + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
bli_swap_incs( incc, ldc ); \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc ); \
|
||||
} \
|
||||
\
|
||||
/* Pack the full panel. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp ); \
|
||||
} \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
|
||||
{ \
|
||||
ctype* restrict c10; \
|
||||
ctype* restrict p10; \
|
||||
dim_t p10_dim, p10_len; \
|
||||
inc_t incc10, ldc10; \
|
||||
doff_t diagoffc10; \
|
||||
conj_t conjc10; \
|
||||
\
|
||||
ctype* restrict c12; \
|
||||
ctype* restrict p12; \
|
||||
dim_t p12_dim, p12_len; \
|
||||
inc_t incc12, ldc12; \
|
||||
doff_t diagoffc12; \
|
||||
conj_t conjc12; \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes were somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc < 0 ) || \
|
||||
( row_stored && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
p10 = p; \
|
||||
c10 = c; \
|
||||
incc10 = incc; \
|
||||
ldc10 = ldc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p; \
|
||||
c10 = c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
\
|
||||
/* Pack to p10. For upper storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,kername)( conjc10, \
|
||||
p10_dim, \
|
||||
p10_len, \
|
||||
kappa, \
|
||||
c10, incc10, ldc10, \
|
||||
p10, ldp ); \
|
||||
\
|
||||
/* Pack to p12. For lower storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,kername)( conjc12, \
|
||||
p12_dim, \
|
||||
p12_len, \
|
||||
kappa, \
|
||||
c12, incc12, ldc12, \
|
||||
p12, ldp ); \
|
||||
\
|
||||
/* Pack the stored triangle of c11 to p11. */ \
|
||||
{ \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict p11; \
|
||||
dim_t p11_m; \
|
||||
dim_t p11_n; \
|
||||
inc_t rs_p11; \
|
||||
inc_t cs_p11; \
|
||||
\
|
||||
p11_m = panel_dim; \
|
||||
p11_n = panel_dim; \
|
||||
j = diagoffc_abs; \
|
||||
p11 = p + (j )*ldp; \
|
||||
c11 = c + (j )*ldc; \
|
||||
\
|
||||
/* Compute the row and column strides of p11. */ \
|
||||
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
|
||||
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
|
||||
\
|
||||
\
|
||||
PASTEMAC(ch,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa, \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* If source matrix c is Hermitian, we have to zero out the
|
||||
imaginary components of the diagonal of p11 in case the
|
||||
corresponding elements in c11 were not already zero. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
/* NOTE: We can directly increment p11 since we are done
|
||||
using p11 for the remainder of the function. */ \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,seti0s)( *p11 ); \
|
||||
\
|
||||
p11 += rs_p11 + cs_p11; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp ); \
|
||||
\
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,setd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
PASTEMAC(ch,invertd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
uplo_t uplop = uploc; \
|
||||
\
|
||||
bli_toggle_uplo( uplop ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
|
||||
\
|
||||
PASTEMAC(ch,setm)( diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )
|
||||
|
||||
@@ -51,15 +51,41 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_tri_cxk )
|
||||
INSERT_GENTPROT_BASIC( packm_struc_cxk )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_herm_cxk )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
@@ -68,12 +94,14 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_4m )
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_3m )
|
||||
INSERT_GENTPROT_BASIC( packm_tri_cxk )
|
||||
|
||||
688
frame/1m/packm/bli_packm_struc_cxk_3m.c
Normal file
688
frame/1m/packm/bli_packm_struc_cxk_3m.c
Normal file
@@ -0,0 +1,688 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t is_p, ldp; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the imaginary stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
is_p = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Handle micro-panel packing based on the structure of the matrix
|
||||
being packed. */ \
|
||||
if ( bli_is_general( strucc ) ) \
|
||||
{ \
|
||||
/* For micro-panels of general matrices, we can call the pack
|
||||
kernel front-end directly. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, is_p, ldp ); \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* Call a helper function for micro-panels of Hermitian/symmetric
|
||||
matrices. */ \
|
||||
PASTEMAC(ch,packm_herm_cxk_3m)( strucc, \
|
||||
diagoffc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
is_p, ldp ); \
|
||||
} \
|
||||
else /* ( bli_is_triangular( strucc ) ) */ \
|
||||
{ \
|
||||
/* Call a helper function for micro-panels of triangular
|
||||
matrices. */ \
|
||||
PASTEMAC(ch,packm_tri_cxk_3m)( strucc, \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
is_p, ldp ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + is_p + (i )*rs_p; \
|
||||
ctype_r* p_edge_rpi = ( ctype_r* )p + 2*is_p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_rpi, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + is_p + (j )*cs_p; \
|
||||
ctype_r* p_edge_rpi = ( ctype_r* )p + 2*is_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_rpi, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) ) \
|
||||
{ \
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting muliplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
dim_t i = m_panel; \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
|
||||
ctype_r* p_br_i = ( ctype_r* )p + is_p + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one_r, \
|
||||
p_br_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
zero_r, \
|
||||
p_br_i, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_3m, packm_cxk_3m )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t i, j; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Since we don't
|
||||
have the schema in scope, we must use the dimensions and strides
|
||||
of the micro-panel to determine whether it is row- or column-
|
||||
stored. */ \
|
||||
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
\
|
||||
\
|
||||
/* Handle the case where the micro-panel does NOT intersect the
|
||||
diagonal separately from the case where it does intersect. */ \
|
||||
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
stored, also taking conjugation into account. (Note this
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
c = c + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
bli_swap_incs( incc, ldc ); \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc ); \
|
||||
} \
|
||||
\
|
||||
/* Pack the full panel. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, is_p, ldp ); \
|
||||
} \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
|
||||
{ \
|
||||
ctype_r* restrict p_r = ( ctype_r* )p; \
|
||||
\
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
|
||||
\
|
||||
ctype* restrict c10; \
|
||||
ctype_r* restrict p10; \
|
||||
dim_t p10_dim, p10_len; \
|
||||
inc_t incc10, ldc10; \
|
||||
doff_t diagoffc10; \
|
||||
conj_t conjc10; \
|
||||
\
|
||||
ctype* restrict c12; \
|
||||
ctype_r* restrict p12; \
|
||||
dim_t p12_dim, p12_len; \
|
||||
inc_t incc12, ldc12; \
|
||||
doff_t diagoffc12; \
|
||||
conj_t conjc12; \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc < 0 ) || \
|
||||
( row_stored && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
p10 = p_r; \
|
||||
c10 = c; \
|
||||
incc10 = incc; \
|
||||
ldc10 = ldc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p_r + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p_r; \
|
||||
c10 = c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p_r + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
\
|
||||
/* Pack to p10. For upper storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,kername)( conjc10, \
|
||||
p10_dim, \
|
||||
p10_len, \
|
||||
kappa, \
|
||||
c10, incc10, ldc10, \
|
||||
p10, is_p, ldp ); \
|
||||
\
|
||||
/* Pack to p12. For lower storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,kername)( conjc12, \
|
||||
p12_dim, \
|
||||
p12_len, \
|
||||
kappa, \
|
||||
c12, incc12, ldc12, \
|
||||
p12, is_p, ldp ); \
|
||||
\
|
||||
/* Pack the stored triangle of c11 to p11. */ \
|
||||
{ \
|
||||
inc_t cs_p11; \
|
||||
inc_t rs_p11; \
|
||||
dim_t p11_m = panel_dim; \
|
||||
dim_t p11_n = panel_dim; \
|
||||
inc_t rs_c11 = 2*rs_c; \
|
||||
inc_t cs_c11 = 2*cs_c; \
|
||||
dim_t j = diagoffc_abs; \
|
||||
ctype* c11 = ( ctype* )c + (j )*ldc; \
|
||||
ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \
|
||||
ctype_r* c11_r = ( ctype_r* )c11; \
|
||||
ctype_r* c11_i = ( ctype_r* )c11 + 1; \
|
||||
ctype_r* p11_r = ( ctype_r* )p11; \
|
||||
ctype_r* p11_i = ( ctype_r* )p11 + is_p; \
|
||||
ctype_r* alpha_r = one_r; \
|
||||
ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \
|
||||
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
|
||||
\
|
||||
/* Compute the row and column strides of p11. */ \
|
||||
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
|
||||
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
|
||||
\
|
||||
/* Copy the real part of the stored triangle of c11 to p11_r. */ \
|
||||
PASTEMAC(chr,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
alpha_r, \
|
||||
c11_r, rs_c11, cs_c11, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* Copy the imaginary part of the stored triangle of c11 to p11_i,
|
||||
scaling by -1 if conjugation on c was requested. */ \
|
||||
PASTEMAC(chr,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
alpha_i, \
|
||||
c11_i, rs_c11, cs_c11, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* If source matrix c is Hermitian, we have to zero out the
|
||||
imaginary components of the diagonal of p11 in case the
|
||||
corresponding elements in c11 were not already zero. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \
|
||||
\
|
||||
PASTEMAC(chr,set0s)( *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Apply kappa to the part of p11 that corresponds to the stored
|
||||
part of c11 that was copied above. */ \
|
||||
if ( bli_is_upper( uploc ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scalris_mxn_u)( 0, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
&kappa_r, \
|
||||
&kappa_i, \
|
||||
p11_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,scalris_mxn_l)( 0, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
&kappa_r, \
|
||||
&kappa_i, \
|
||||
p11_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
\
|
||||
/* Update the p11 section of the ri panel. It simply needs
|
||||
to contain the sum of p11_r + p11_i. */ \
|
||||
{ \
|
||||
ctype_r* p11_rpi = p11_i + is_p; \
|
||||
\
|
||||
for ( j = 0; j < p11_n; ++j ) \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p11 + (j )*cs_p11; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p11 + (j )*cs_p11; \
|
||||
ctype_r* pi11_rpi = p11_rpi + (i )*rs_p11 + (j )*cs_p11; \
|
||||
\
|
||||
PASTEMAC(chr,add3s)( *pi11_r, \
|
||||
*pi11_i, \
|
||||
*pi11_rpi ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_3m, packm_cxk_3m )
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Since we don't
|
||||
have the schema in scope, we must use the dimensions and strides
|
||||
of the micro-panel to determine whether it is row- or column-
|
||||
stored. */ \
|
||||
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, is_p, ldp ); \
|
||||
\
|
||||
\
|
||||
/* Tweak the panel according to its triangular structure */ \
|
||||
{ \
|
||||
dim_t j = bli_abs( diagoffp ); \
|
||||
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
|
||||
ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \
|
||||
ctype_r* p11_rpi = ( ctype_r* )p + 2*is_p + (j )*ldp; \
|
||||
inc_t cs_p11; \
|
||||
inc_t rs_p11; \
|
||||
\
|
||||
/* Compute the row and column strides of p11. */ \
|
||||
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
|
||||
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
|
||||
\
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
|
||||
dim_t i; \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_i, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* Update the diagonal of the p11 section of the rpi panel.
|
||||
It simply needs to contain the sum of diagonals of p11_r
|
||||
and p11_i. */ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_rpi = p11_rpi + (i )*rs_p + (i )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,add3s)( *pi11_r, *pi11_i, *pi11_rpi ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. Note
|
||||
that we do not need to update the ri panel since inverted
|
||||
diagonals are only needed by trsm, which does not use the
|
||||
p11 section of the ri panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
dim_t i; \
|
||||
\
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
uplo_t uplop11 = uploc; \
|
||||
doff_t diagoffp11 = 0; \
|
||||
\
|
||||
bli_toggle_uplo( uplop11 ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
|
||||
\
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_rpi, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_3m, packm_cxk_3m )
|
||||
|
||||
@@ -32,14 +32,16 @@
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
@@ -49,7 +51,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_herm_cxk )
|
||||
INSERT_GENTPROTCO_BASIC( packm_struc_cxk_3m )
|
||||
|
||||
|
||||
|
||||
@@ -65,11 +67,41 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_4m )
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_3m )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_3m )
|
||||
|
||||
638
frame/1m/packm/bli_packm_struc_cxk_4m.c
Normal file
638
frame/1m/packm/bli_packm_struc_cxk_4m.c
Normal file
@@ -0,0 +1,638 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t is_p, ldp; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the imaginary stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
is_p = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Handle micro-panel packing based on the structure of the matrix
|
||||
being packed. */ \
|
||||
if ( bli_is_general( strucc ) ) \
|
||||
{ \
|
||||
/* For micro-panels of general matrices, we can call the pack
|
||||
kernel front-end directly. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, is_p, ldp ); \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* Call a helper function for micro-panels of Hermitian/symmetric
|
||||
matrices. */ \
|
||||
PASTEMAC(ch,packm_herm_cxk_4m)( strucc, \
|
||||
diagoffc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
is_p, ldp ); \
|
||||
} \
|
||||
else /* ( bli_is_triangular( strucc ) ) */ \
|
||||
{ \
|
||||
/* Call a helper function for micro-panels of triangular
|
||||
matrices. */ \
|
||||
PASTEMAC(ch,packm_tri_cxk_4m)( strucc, \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
is_p, ldp ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + is_p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + is_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) ) \
|
||||
{ \
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting muliplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
dim_t i = m_panel; \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
|
||||
ctype_r* p_br_i = ( ctype_r* )p + is_p + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one_r, \
|
||||
p_br_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
zero_r, \
|
||||
p_br_i, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_4m, packm_cxk_4m )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t i, j; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Since we don't
|
||||
have the schema in scope, we must use the dimensions and strides
|
||||
of the micro-panel to determine whether it is row- or column-
|
||||
stored. */ \
|
||||
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
\
|
||||
\
|
||||
/* Handle the case where the micro-panel does NOT intersect the
|
||||
diagonal separately from the case where it does intersect. */ \
|
||||
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
stored, also taking conjugation into account. (Note this
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
c = c + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
bli_swap_incs( incc, ldc ); \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc ); \
|
||||
} \
|
||||
\
|
||||
/* Pack the full panel. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, is_p, ldp ); \
|
||||
} \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
|
||||
{ \
|
||||
ctype_r* restrict p_r = ( ctype_r* )p; \
|
||||
\
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
|
||||
\
|
||||
ctype* restrict c10; \
|
||||
ctype_r* restrict p10; \
|
||||
dim_t p10_dim, p10_len; \
|
||||
inc_t incc10, ldc10; \
|
||||
doff_t diagoffc10; \
|
||||
conj_t conjc10; \
|
||||
\
|
||||
ctype* restrict c12; \
|
||||
ctype_r* restrict p12; \
|
||||
dim_t p12_dim, p12_len; \
|
||||
inc_t incc12, ldc12; \
|
||||
doff_t diagoffc12; \
|
||||
conj_t conjc12; \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc < 0 ) || \
|
||||
( row_stored && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
p10 = p_r; \
|
||||
c10 = c; \
|
||||
incc10 = incc; \
|
||||
ldc10 = ldc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p_r + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p_r; \
|
||||
c10 = c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p_r + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
\
|
||||
/* Pack to p10. For upper storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,kername)( conjc10, \
|
||||
p10_dim, \
|
||||
p10_len, \
|
||||
kappa, \
|
||||
c10, incc10, ldc10, \
|
||||
p10, is_p, ldp ); \
|
||||
\
|
||||
/* Pack to p12. For lower storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,kername)( conjc12, \
|
||||
p12_dim, \
|
||||
p12_len, \
|
||||
kappa, \
|
||||
c12, incc12, ldc12, \
|
||||
p12, is_p, ldp ); \
|
||||
\
|
||||
/* Pack the stored triangle of c11 to p11. */ \
|
||||
{ \
|
||||
inc_t cs_p11; \
|
||||
inc_t rs_p11; \
|
||||
dim_t p11_m = panel_dim; \
|
||||
dim_t p11_n = panel_dim; \
|
||||
inc_t rs_c11 = 2*rs_c; \
|
||||
inc_t cs_c11 = 2*cs_c; \
|
||||
dim_t j = diagoffc_abs; \
|
||||
ctype* c11 = ( ctype* )c + (j )*ldc; \
|
||||
ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \
|
||||
ctype_r* c11_r = ( ctype_r* )c11; \
|
||||
ctype_r* c11_i = ( ctype_r* )c11 + 1; \
|
||||
ctype_r* p11_r = ( ctype_r* )p11; \
|
||||
ctype_r* p11_i = ( ctype_r* )p11 + is_p; \
|
||||
ctype_r* alpha_r = one_r; \
|
||||
ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \
|
||||
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
|
||||
\
|
||||
/* Compute the row and column strides of p11. */ \
|
||||
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
|
||||
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
|
||||
\
|
||||
/* Copy the real part of the stored triangle of c11 to p11_r. */ \
|
||||
PASTEMAC(chr,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
alpha_r, \
|
||||
c11_r, rs_c11, cs_c11, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* Copy the imaginary part of the stored triangle of c11 to p11_i,
|
||||
scaling by -1 if conjugation on c was requested. */ \
|
||||
PASTEMAC(chr,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
alpha_i, \
|
||||
c11_i, rs_c11, cs_c11, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* If source matrix c is Hermitian, we have to zero out the
|
||||
imaginary components of the diagonal of p11 in case the
|
||||
corresponding elements in c11 were not already zero. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \
|
||||
\
|
||||
PASTEMAC(chr,set0s)( *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Apply kappa to the part of p11 that corresponds to the stored
|
||||
part of c11 that was copied above. */ \
|
||||
if ( bli_is_upper( uploc ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scalris_mxn_u)( 0, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
&kappa_r, \
|
||||
&kappa_i, \
|
||||
p11_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,scalris_mxn_l)( 0, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
&kappa_r, \
|
||||
&kappa_i, \
|
||||
p11_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \
|
||||
p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \
|
||||
p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_4m, packm_cxk_4m )
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Since we don't
|
||||
have the schema in scope, we must use the dimensions and strides
|
||||
of the micro-panel to determine whether it is row- or column-
|
||||
stored. */ \
|
||||
row_stored = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
col_stored = bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ); \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, is_p, ldp ); \
|
||||
\
|
||||
\
|
||||
/* Tweak the panel according to its triangular structure */ \
|
||||
{ \
|
||||
dim_t j = bli_abs( diagoffp ); \
|
||||
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
|
||||
ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \
|
||||
inc_t cs_p11; \
|
||||
inc_t rs_p11; \
|
||||
\
|
||||
/* Compute the row and column strides of p11. */ \
|
||||
if ( row_stored ) { rs_p11 = rs_p; cs_p11 = 1; } \
|
||||
else /* if ( col_stored ) */ { rs_p11 = 1; cs_p11 = cs_p; } \
|
||||
\
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_i, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
dim_t i; \
|
||||
\
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
uplo_t uplop11 = uploc; \
|
||||
doff_t diagoffp11 = 0; \
|
||||
\
|
||||
bli_toggle_uplo( uplop11 ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
|
||||
\
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_4m, packm_cxk_4m )
|
||||
|
||||
@@ -32,14 +32,16 @@
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
@@ -49,7 +51,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_gen_cxk )
|
||||
INSERT_GENTPROTCO_BASIC( packm_struc_cxk_4m )
|
||||
|
||||
|
||||
|
||||
@@ -65,11 +67,41 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_gen_cxk_4m )
|
||||
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_4m )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_4m )
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_gen_cxk_3m )
|
||||
@@ -1,720 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
\
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t ldp; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp ); \
|
||||
\
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,setd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
PASTEMAC(ch,invertd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
uplo_t uplop = uploc; \
|
||||
\
|
||||
bli_toggle_uplo( uplop ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
|
||||
\
|
||||
PASTEMAC(ch,setm)( diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting muliplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype* one = PASTEMAC(ch,1); \
|
||||
ctype* p_br = p + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one, \
|
||||
p_br, rs_p, cs_p ); \
|
||||
} \
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: setting br unit diag", m_br, n_br, \
|
||||
p_edge, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
/*
|
||||
if ( rs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", m_panel_max, n_panel_max, \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
if ( cs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", m_panel_max, n_panel_max, \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( packm_tri_cxk )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
\
|
||||
dim_t i; \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
inc_t rs_p11, cs_p11; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
rs_p11 = 1; \
|
||||
cs_p11 = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_4m)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
\
|
||||
\
|
||||
/* Tweak the panel according to its triangular structure */ \
|
||||
{ \
|
||||
dim_t j = bli_abs( diagoffp ); \
|
||||
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
|
||||
ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_i, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
uplo_t uplop11 = uploc; \
|
||||
doff_t diagoffp11 = 0; \
|
||||
\
|
||||
bli_toggle_uplo( uplop11 ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
|
||||
\
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
\
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting muliplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
|
||||
ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one_r, \
|
||||
p_br_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
zero_r, \
|
||||
p_br_i, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_4m )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
\
|
||||
dim_t i; \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
inc_t rs_p11, cs_p11; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
rs_p11 = 1; \
|
||||
cs_p11 = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_3m)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
\
|
||||
\
|
||||
/* Tweak the panel according to its triangular structure */ \
|
||||
{ \
|
||||
dim_t j = bli_abs( diagoffp ); \
|
||||
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
|
||||
ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \
|
||||
ctype_r* p11_rpi = ( ctype_r* )p + 2*psp + (j )*ldp; \
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_i, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_r, \
|
||||
p11_rpi, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. Note
|
||||
that we do not need to update the ri panel since inverted
|
||||
diagonals are only needed by trsm, which does not use the
|
||||
p11 section of the ri panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
uplo_t uplop11 = uploc; \
|
||||
doff_t diagoffp11 = 0; \
|
||||
\
|
||||
bli_toggle_uplo( uplop11 ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
|
||||
\
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_rpi, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_rpi, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
ctype_r* p_edge_rpi = ( ctype_r* )p + 2*psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_rpi, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting muliplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
|
||||
ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one_r, \
|
||||
p_br_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
zero_r, \
|
||||
p_br_i, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_3m )
|
||||
|
||||
@@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -54,8 +54,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -133,7 +133,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -145,8 +145,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -232,7 +232,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -244,8 +244,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -339,7 +339,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -351,8 +351,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -454,7 +454,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -466,8 +466,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -577,7 +577,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -589,8 +589,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -708,7 +708,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -720,8 +720,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -847,7 +847,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -859,8 +859,8 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* restrict pi1_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
|
||||
@@ -40,7 +40,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_2xk_3m )
|
||||
|
||||
@@ -42,7 +42,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -54,7 +54,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -128,7 +128,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -140,7 +140,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -222,7 +222,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -234,7 +234,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -324,7 +324,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -336,7 +336,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -434,7 +434,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -446,7 +446,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -552,7 +552,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -564,7 +564,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -678,7 +678,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -690,7 +690,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
@@ -812,7 +812,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
@@ -824,7 +824,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + is_p; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
{ \
|
||||
|
||||
@@ -40,7 +40,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
void* p, inc_t is_p, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_2xk_4m )
|
||||
|
||||
@@ -119,7 +119,7 @@ void bli_gemm3m_cntl_init()
|
||||
gemm3m_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_mr,
|
||||
gemm3m_kr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
@@ -132,7 +132,7 @@ void bli_gemm3m_cntl_init()
|
||||
gemm3m_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_kr,
|
||||
gemm3m_nr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
|
||||
@@ -119,7 +119,7 @@ void bli_gemm4m_cntl_init()
|
||||
gemm4m_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_mr,
|
||||
gemm4m_kr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
@@ -132,7 +132,7 @@ void bli_gemm4m_cntl_init()
|
||||
gemm4m_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_kr,
|
||||
gemm4m_nr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
|
||||
@@ -62,7 +62,7 @@ void bli_herk3m_cntl_init()
|
||||
herk3m_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_mr,
|
||||
gemm3m_kr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
@@ -75,7 +75,7 @@ void bli_herk3m_cntl_init()
|
||||
herk3m_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_kr,
|
||||
gemm3m_nr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
|
||||
@@ -62,7 +62,7 @@ void bli_herk4m_cntl_init()
|
||||
herk4m_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_mr,
|
||||
gemm4m_kr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
@@ -75,7 +75,7 @@ void bli_herk4m_cntl_init()
|
||||
herk4m_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_kr,
|
||||
gemm4m_nr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
|
||||
@@ -73,7 +73,7 @@ void bli_trmm3m_cntl_init()
|
||||
trmm3m_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
gemm3m_mr,
|
||||
@@ -88,7 +88,7 @@ void bli_trmm3m_cntl_init()
|
||||
trmm3m_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
gemm3m_mr,
|
||||
@@ -104,7 +104,7 @@ void bli_trmm3m_cntl_init()
|
||||
trmm3m_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to nr.
|
||||
gemm3m_mr,
|
||||
@@ -119,7 +119,7 @@ void bli_trmm3m_cntl_init()
|
||||
trmm3m_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: m dim multiple here must be nr
|
||||
// since "k" dim multiple is set to nr above.
|
||||
gemm3m_nr,
|
||||
|
||||
@@ -73,7 +73,7 @@ void bli_trmm4m_cntl_init()
|
||||
trmm4m_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
gemm4m_mr,
|
||||
@@ -88,7 +88,7 @@ void bli_trmm4m_cntl_init()
|
||||
trmm4m_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
gemm4m_mr,
|
||||
@@ -104,7 +104,7 @@ void bli_trmm4m_cntl_init()
|
||||
trmm4m_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to nr.
|
||||
gemm4m_mr,
|
||||
@@ -119,7 +119,7 @@ void bli_trmm4m_cntl_init()
|
||||
trmm4m_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: m dim multiple here must be nr
|
||||
// since "k" dim multiple is set to nr above.
|
||||
gemm4m_nr,
|
||||
|
||||
@@ -92,7 +92,7 @@ void bli_trsm3m_cntl_init()
|
||||
trsm3m_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: n dim multiple must be mr to
|
||||
// support right and bottom-right edge cases
|
||||
gemm3m_mr,
|
||||
@@ -107,7 +107,7 @@ void bli_trsm3m_cntl_init()
|
||||
trsm3m_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: m dim multiple must be mr since
|
||||
// B_pack is updated (ie: serves as C) in trsm
|
||||
gemm3m_mr,
|
||||
@@ -123,7 +123,7 @@ void bli_trsm3m_cntl_init()
|
||||
trsm3m_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_nr,
|
||||
gemm3m_mr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
@@ -136,7 +136,7 @@ void bli_trsm3m_cntl_init()
|
||||
trsm3m_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_mr,
|
||||
gemm3m_mr,
|
||||
TRUE, // densify
|
||||
|
||||
@@ -93,7 +93,7 @@ void bli_trsm4m_cntl_init()
|
||||
trsm4m_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: n dim multiple must be mr to
|
||||
// support right and bottom-right edge cases
|
||||
gemm4m_mr,
|
||||
@@ -108,7 +108,7 @@ void bli_trsm4m_cntl_init()
|
||||
trsm4m_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: m dim multiple must be mr since
|
||||
// B_pack is updated (ie: serves as C) in trsm
|
||||
gemm4m_mr,
|
||||
@@ -124,7 +124,7 @@ void bli_trsm4m_cntl_init()
|
||||
trsm4m_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_nr,
|
||||
gemm4m_mr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
@@ -137,7 +137,7 @@ void bli_trsm4m_cntl_init()
|
||||
trsm4m_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_mr,
|
||||
gemm4m_mr,
|
||||
TRUE, // densify
|
||||
|
||||
@@ -101,5 +101,31 @@ typedef void \
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_ukr_t )
|
||||
|
||||
|
||||
// -- packm_struc_cxk kernel --
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, tname ) \
|
||||
\
|
||||
typedef void \
|
||||
(*PASTECH(ch,tname))( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ker_t )
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
Reference in New Issue
Block a user