Consolidated packm_blk_var1 and packm_blk_var2.

Details:
- Consolidated the two blocked variants for packm into a single
  implementation (packm_blk_var1) and removed the other variant.
- Updated all induced method _cntl_init() functions in frame/cntl/ind/
  to use the new blocked variant 1.
- Defined two new macros, bli_is_ind_packed() and bli_is_nat_packed(),
  to detect pack_t schemas for induced methods and native execution,
  respectively.
This commit is contained in:
Field G. Van Zee
2015-11-13 16:29:12 -06:00
parent 30e5eb29e0
commit 0b126de134
15 changed files with 242 additions and 309 deletions

View File

@@ -42,7 +42,6 @@
#include "bli_packm_unb_var1.h"
#include "bli_packm_blk_var1.h"
#include "bli_packm_blk_var2.h"
#include "bli_packm_struc_cxk.h"
#include "bli_packm_struc_cxk_4mi.h"

View File

@@ -62,6 +62,9 @@ typedef void (*FUNCPTR_T)(
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
extern func_t* packm_struc_cxk_kers;
extern func_t* packm_struc_cxk_4mi_kers;
extern func_t* packm_struc_cxk_3mis_kers;
extern func_t* packm_struc_cxk_rih_kers;
void bli_packm_blk_var1( obj_t* c,
@@ -96,6 +99,8 @@ void bli_packm_blk_var1( obj_t* c,
dim_t pd_p = bli_obj_panel_dim( *p );
inc_t ps_p = bli_obj_panel_stride( *p );
obj_t kappa;
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
@@ -103,14 +108,61 @@ void bli_packm_blk_var1( obj_t* c,
FUNCPTR_T f;
// This variant assumes that the micro-kernel will always apply the
// alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
// for kappa so that the underlying packm implementation does not
// scale during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
// Treatment of kappa (ie: scaling during packing) depends on
// whether we are executing an induced method.
if ( bli_is_ind_packed( schema ) )
{
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if( thread_am_ochief( t ) )
{
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
}
kappa_p = thread_obroadcast( t, kappa_p );
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
}
else // if ( bli_is_nat_packed( schema ) )
{
// This branch is for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
}
// Choose the correct func_t object.
packm_kers = packm_struc_cxk_kers;
// Choose the correct func_t object based on the pack_t schema.
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_obj_query( dt_cp, packm_kers );
@@ -144,8 +196,8 @@ void bli_packm_blk_var1( obj_t* c,
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kertype ) \
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kertype ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
@@ -204,6 +256,9 @@ void PASTEMAC(ch,varname)( \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
@@ -274,6 +329,17 @@ void PASTEMAC(ch,varname)( \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
@@ -297,6 +363,15 @@ void PASTEMAC(ch,varname)( \
} \
\
p_begin = p_cast; \
\
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/ \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
@@ -353,6 +428,15 @@ void PASTEMAC(ch,varname)( \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
@@ -370,25 +454,27 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p ); \
is_p_use ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ldp * panel_len_max_i; \
\
/* We nudge the panel increment up by one if it is odd. */ \
p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
@@ -404,13 +490,11 @@ void PASTEMAC(ch,varname)( \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p, \
is_p ); \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
/*p_inc = ldp * panel_len_max_i;*/ \
p_inc = ps_p; \
} \
else \
@@ -418,9 +502,14 @@ void PASTEMAC(ch,varname)( \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
@@ -436,28 +525,81 @@ void PASTEMAC(ch,varname)( \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p, \
is_p ); \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use ); \
} \
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
p_begin, rs_p, cs_p, "%9.2e", "" ); \
else if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
p_begin, rs_p, cs_p, "%9.2e", "" ); \
*/ \
\
/* NOTE: This value is equivalent to ps_p. */ \
/*p_inc = ldp * panel_len_max_i;*/ \
p_inc = ps_p; \
} \
\
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/ \
/*
*/ \
\
/*
*/ \
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/ \
\
\
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
\
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
p_begin += p_inc; \
\
} \
}
INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t )
INSERT_GENTFUNCR_BASIC( packm_blk_var1, packm_ker_t )

View File

@@ -59,15 +59,12 @@ typedef void (*FUNCPTR_T)(
packm_thrinfo_t* thread
);
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
extern func_t* packm_struc_cxk_kers;
extern func_t* packm_struc_cxk_4mi_kers;
extern func_t* packm_struc_cxk_3mis_kers;
extern func_t* packm_struc_cxk_rih_kers;
void bli_packm_blk_var2( obj_t* c,
void bli_packm_blk_var1( obj_t* c,
obj_t* p,
packm_thrinfo_t* t )
{
@@ -99,8 +96,6 @@ void bli_packm_blk_var2( obj_t* c,
dim_t pd_p = bli_obj_panel_dim( *p );
inc_t ps_p = bli_obj_panel_stride( *p );
obj_t kappa;
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
@@ -108,59 +103,14 @@ void bli_packm_blk_var2( obj_t* c,
FUNCPTR_T f;
// This variant assumes that the micro-kernel will always apply the
// alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
// for kappa so that the underlying packm implementation does not
// scale during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
/*
// We want this variant to behave identically to that of variant 1
// in the real domain.
if ( bli_is_real( dt_cp ) )
{
bli_packm_blk_var1( c, p, t );
return;
}
*/
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if( thread_am_ochief( t ) )
{
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
}
kappa_p = thread_obroadcast( t, kappa_p );
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
// Choose the correct func_t object based on the pack_t schema.
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
// Choose the correct func_t object.
packm_kers = packm_struc_cxk_kers;
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_obj_query( dt_cp, packm_kers );
@@ -168,9 +118,7 @@ void bli_packm_blk_var2( obj_t* c,
// Index into the type combination array to extract the correct
// function pointer.
//f = ftypes[dt_cp];
if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var2;
else f = bli_zpackm_blk_var2;
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
@@ -196,8 +144,8 @@ void bli_packm_blk_var2( obj_t* c,
}
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kertype ) \
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kertype ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
@@ -256,9 +204,6 @@ void PASTEMAC(ch,varname)( \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
@@ -329,17 +274,6 @@ void PASTEMAC(ch,varname)( \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
@@ -363,15 +297,6 @@ void PASTEMAC(ch,varname)( \
} \
\
p_begin = p_cast; \
\
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/ \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
@@ -428,14 +353,6 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
@@ -453,31 +370,25 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use ); \
is_p ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
/*
p_inc = ldp * panel_len_max_i; \
\
/* We nudge the panel increment up by one if it is odd. */ \
p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \
*/ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
@@ -493,12 +404,13 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use ); \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p, \
is_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
/*p_inc = ldp * panel_len_max_i;*/ \
p_inc = ps_p; \
} \
else \
@@ -506,14 +418,9 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
@@ -529,81 +436,28 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use ); \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p, \
is_p ); \
} \
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
p_begin, rs_p, cs_p, "%9.2e", "" ); \
else if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
p_begin, rs_p, cs_p, "%9.2e", "" ); \
*/ \
\
/* NOTE: This value is equivalent to ps_p. */ \
/*p_inc = ldp * panel_len_max_i;*/ \
p_inc = ps_p; \
} \
\
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/ \
/*
*/ \
\
/*
*/ \
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/ \
\
\
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
\
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
p_begin += p_inc; \
\
} \
}
INSERT_GENTFUNCCO_BASIC( packm_blk_var2, packm_ker_t )
INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t )

View File

@@ -1,67 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
void bli_packm_blk_var2( obj_t* c,
obj_t* p,
packm_thrinfo_t* t );
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
packm_thrinfo_t* t \
);
INSERT_GENTPROTCO_BASIC( packm_blk_var2 )

View File

@@ -44,7 +44,7 @@ static FUNCPTR_T vars[6][3] =
{
// unblocked optimized unblocked blocked
{ bli_packm_unb_var1, NULL, bli_packm_blk_var1 },
{ NULL, NULL, bli_packm_blk_var2 },
{ NULL, NULL, NULL, },
{ NULL, NULL, NULL, },
{ NULL, NULL, NULL, },
{ NULL, NULL, NULL, },

View File

@@ -654,6 +654,14 @@
bli_is_io_packed( schema ) || \
bli_is_rpi_packed( schema ) )
// Evaluates to nonzero when none of the BLIS_PACK_FORMAT_BITS are set
// in the given pack_t schema (ie: the schema is for native execution).
// The argument is parenthesized so that a compound expression (such as
// a | b) is masked as a whole rather than term-by-term.
#define bli_is_nat_packed( schema ) \
\
( ( ( schema ) & BLIS_PACK_FORMAT_BITS ) == 0 )
// Evaluates to nonzero when at least one of the BLIS_PACK_FORMAT_BITS
// is set in the given pack_t schema (ie: the schema is for an induced
// method). The argument is parenthesized so that a compound expression
// (such as a | b) is masked as a whole rather than term-by-term.
#define bli_is_ind_packed( schema ) \
\
( ( ( schema ) & BLIS_PACK_FORMAT_BITS ) != 0 )
// pointer-related
@@ -668,9 +676,6 @@
}
// return datatype for char
#define bli_stype ( BLIS_FLOAT )

View File

@@ -138,7 +138,7 @@ void bli_gemm3m1_cntl_init()
gemm3m1_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3m1_mr,
gemm3m1_kr,
FALSE, // do NOT invert diagonal
@@ -150,7 +150,7 @@ void bli_gemm3m1_cntl_init()
gemm3m1_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3m1_kr,
gemm3m1_nr,
FALSE, // do NOT invert diagonal

View File

@@ -146,7 +146,7 @@ void bli_gemm3m2_cntl_init()
gemm3m2_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3m2_mr,
gemm3m2_kr,
FALSE, // do NOT invert diagonal
@@ -158,7 +158,7 @@ void bli_gemm3m2_cntl_init()
gemm3m2_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3m2_kr,
gemm3m2_nr,
FALSE, // do NOT invert diagonal

View File

@@ -144,7 +144,7 @@ void bli_gemm3m3_cntl_init()
gemm3m3_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3m3_kr,
gemm3m3_nr,
FALSE, // do NOT invert diagonal

View File

@@ -143,7 +143,7 @@ void bli_gemm3mh_cntl_init()
gemm3mh_packa_cntl_ro
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3mh_mr,
gemm3mh_kr,
FALSE, // do NOT invert diagonal
@@ -155,7 +155,7 @@ void bli_gemm3mh_cntl_init()
gemm3mh_packb_cntl_ro
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3mh_kr,
gemm3mh_nr,
FALSE, // do NOT invert diagonal
@@ -168,7 +168,7 @@ void bli_gemm3mh_cntl_init()
gemm3mh_packa_cntl_io
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3mh_mr,
gemm3mh_kr,
FALSE, // do NOT invert diagonal
@@ -180,7 +180,7 @@ void bli_gemm3mh_cntl_init()
gemm3mh_packb_cntl_io
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3mh_kr,
gemm3mh_nr,
FALSE, // do NOT invert diagonal
@@ -193,7 +193,7 @@ void bli_gemm3mh_cntl_init()
gemm3mh_packa_cntl_rpi
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3mh_mr,
gemm3mh_kr,
FALSE, // do NOT invert diagonal
@@ -205,7 +205,7 @@ void bli_gemm3mh_cntl_init()
gemm3mh_packb_cntl_rpi
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3mh_kr,
gemm3mh_nr,
FALSE, // do NOT invert diagonal

View File

@@ -135,7 +135,7 @@ void bli_gemm4m1_cntl_init()
gemm4m1_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4m1_mr,
gemm4m1_kr,
FALSE, // do NOT invert diagonal
@@ -147,7 +147,7 @@ void bli_gemm4m1_cntl_init()
gemm4m1_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4m1_kr,
gemm4m1_nr,
FALSE, // do NOT invert diagonal

View File

@@ -137,7 +137,7 @@ void bli_gemm4mb_cntl_init()
gemm4mb_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4mb_mr,
gemm4mb_kr,
FALSE, // do NOT invert diagonal
@@ -149,7 +149,7 @@ void bli_gemm4mb_cntl_init()
gemm4mb_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4mb_kr,
gemm4mb_nr,
FALSE, // do NOT invert diagonal

View File

@@ -145,7 +145,7 @@ void bli_gemm4mh_cntl_init()
gemm4mh_packa_cntl_ro
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4mh_mr,
gemm4mh_kr,
FALSE, // do NOT invert diagonal
@@ -157,7 +157,7 @@ void bli_gemm4mh_cntl_init()
gemm4mh_packb_cntl_ro
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4mh_kr,
gemm4mh_nr,
FALSE, // do NOT invert diagonal
@@ -170,7 +170,7 @@ void bli_gemm4mh_cntl_init()
gemm4mh_packa_cntl_io
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4mh_mr,
gemm4mh_kr,
FALSE, // do NOT invert diagonal
@@ -182,7 +182,7 @@ void bli_gemm4mh_cntl_init()
gemm4mh_packb_cntl_io
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4mh_kr,
gemm4mh_nr,
FALSE, // do NOT invert diagonal

View File

@@ -112,7 +112,7 @@ void bli_trsm3m1_cntl_init()
trsm3m1_l_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
// IMPORTANT: n dim multiple must be mr to
// support right and bottom-right edge cases
gemm3m1_mr,
@@ -126,7 +126,7 @@ void bli_trsm3m1_cntl_init()
trsm3m1_l_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
// IMPORTANT: m dim multiple must be mr since
// B_pack is updated (ie: serves as C) in trsm
gemm3m1_mr,
@@ -141,7 +141,7 @@ void bli_trsm3m1_cntl_init()
trsm3m1_r_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3m1_nr,
gemm3m1_mr,
FALSE, // do NOT invert diagonal
@@ -153,7 +153,7 @@ void bli_trsm3m1_cntl_init()
trsm3m1_r_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm3m1_mr,
gemm3m1_mr,
TRUE, // invert diagonal

View File

@@ -112,7 +112,7 @@ void bli_trsm4m1_cntl_init()
trsm4m1_l_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
// IMPORTANT: n dim multiple must be mr to
// support right and bottom-right edge cases
gemm4m1_mr,
@@ -126,7 +126,7 @@ void bli_trsm4m1_cntl_init()
trsm4m1_l_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
// IMPORTANT: m dim multiple must be mr since
// B_pack is updated (ie: serves as C) in trsm
gemm4m1_mr,
@@ -141,7 +141,7 @@ void bli_trsm4m1_cntl_init()
trsm4m1_r_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4m1_nr,
gemm4m1_mr,
FALSE, // do NOT invert diagonal
@@ -153,7 +153,7 @@ void bli_trsm4m1_cntl_init()
trsm4m1_r_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_VARIANT1,
gemm4m1_mr,
gemm4m1_mr,
TRUE, // invert diagonal