mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Consolidated packm_blk_var1 and packm_blk_var2.
Details: - Consolidated the two blocked variants for packm into a single implementation (packm_blk_var1) and removed the other variant. - Updated all induced method _cntl_init() functions in frame/cntl/ind/ to use the new blocked variant 1. - Defined two new macros, bli_is_ind_packed() and bli_is_nat_packed(), to detect pack_t schemas for induced methods and native execution, respectively.
This commit is contained in:
@@ -42,7 +42,6 @@
|
||||
#include "bli_packm_unb_var1.h"
|
||||
|
||||
#include "bli_packm_blk_var1.h"
|
||||
#include "bli_packm_blk_var2.h"
|
||||
|
||||
#include "bli_packm_struc_cxk.h"
|
||||
#include "bli_packm_struc_cxk_4mi.h"
|
||||
|
||||
@@ -62,6 +62,9 @@ typedef void (*FUNCPTR_T)(
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
|
||||
|
||||
extern func_t* packm_struc_cxk_kers;
|
||||
extern func_t* packm_struc_cxk_4mi_kers;
|
||||
extern func_t* packm_struc_cxk_3mis_kers;
|
||||
extern func_t* packm_struc_cxk_rih_kers;
|
||||
|
||||
|
||||
void bli_packm_blk_var1( obj_t* c,
|
||||
@@ -96,6 +99,8 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
dim_t pd_p = bli_obj_panel_dim( *p );
|
||||
inc_t ps_p = bli_obj_panel_stride( *p );
|
||||
|
||||
obj_t kappa;
|
||||
obj_t* kappa_p;
|
||||
void* buf_kappa;
|
||||
|
||||
func_t* packm_kers;
|
||||
@@ -103,14 +108,61 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// This variant assumes that the micro-kernel will always apply the
|
||||
// alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
|
||||
// for kappa so that the underlying packm implementation does not
|
||||
// scale during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
|
||||
// Treatment of kappa (ie: packing during scaling) depends on
|
||||
// whether we are executing an induced method.
|
||||
if ( bli_is_ind_packed( schema ) )
|
||||
{
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing induced complex domain algorithms in terms of
|
||||
// real domain micro-kernels. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if( thread_am_ochief( t ) )
|
||||
{
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
|
||||
}
|
||||
else // if ( bli_is_nat_packed( schema ) )
|
||||
{
|
||||
// This branch is for native execution, where we assume that
|
||||
// the micro-kernel will always apply the alpha scalar of the
|
||||
// higher-level operation. Thus, we use BLIS_ONE for kappa so
|
||||
// that the underlying packm implementation does not perform
|
||||
// any scaling during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
|
||||
}
|
||||
|
||||
// Choose the correct func_t object.
|
||||
packm_kers = packm_struc_cxk_kers;
|
||||
|
||||
// Choose the correct func_t object based on the pack_t schema.
|
||||
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
|
||||
else if ( bli_is_3mi_packed( schema ) ||
|
||||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
|
||||
else if ( bli_is_ro_packed( schema ) ||
|
||||
bli_is_io_packed( schema ) ||
|
||||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
|
||||
else packm_kers = packm_struc_cxk_kers;
|
||||
|
||||
// Query the datatype-specific function pointer from the func_t object.
|
||||
packm_ker = bli_func_obj_query( dt_cp, packm_kers );
|
||||
@@ -144,8 +196,8 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, kertype ) \
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kertype ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
@@ -204,6 +256,9 @@ void PASTEMAC(ch,varname)( \
|
||||
conj_t conjc; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
inc_t is_p_use; \
|
||||
dim_t ss_num; \
|
||||
dim_t ss_den; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
@@ -274,6 +329,17 @@ void PASTEMAC(ch,varname)( \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1. However,
|
||||
in the case of interleaved 3m, we need to scale by 3/2, and in the
|
||||
cases of real-only, imag-only, or summed-only, we need to scale by
|
||||
1/2. In both cases, we are compensating for the fact that pointer
|
||||
arithmetic occurs in terms of complex elements rather than real
|
||||
elements. */ \
|
||||
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
|
||||
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else { ss_num = 1; ss_den = 1; } \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
@@ -297,6 +363,15 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
if ( col_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
@@ -353,6 +428,15 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
/* We need to re-compute the imaginary stride as a function of
|
||||
panel_len_max_i since triangular packed matrices have panels
|
||||
of varying lengths. NOTE: This imaginary stride value is
|
||||
only referenced by the packm kernels for induced methods. */ \
|
||||
is_p_use = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the imaginary stride up by one if it is odd. */ \
|
||||
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
@@ -370,25 +454,27 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p ); \
|
||||
is_p_use ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is usually LESS than ps_p because triangular
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the panel increment up by one if it is odd. */ \
|
||||
p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \
|
||||
p_inc = ( is_p_use * ss_num ) / ss_den; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a Hermitian or
|
||||
symmetric matrix, which includes stored, unstored, and
|
||||
diagonal-intersecting panels. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
@@ -404,13 +490,11 @@ void PASTEMAC(ch,varname)( \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p, \
|
||||
is_p ); \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
/*p_inc = ldp * panel_len_max_i;*/ \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
else \
|
||||
@@ -418,9 +502,14 @@ void PASTEMAC(ch,varname)( \
|
||||
/* This case executes if the panel is general, or, if the
|
||||
panel is part of a triangular matrix and is neither unstored
|
||||
(ie: zero) nor diagonal-intersecting. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
@@ -436,28 +525,81 @@ void PASTEMAC(ch,varname)( \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p, \
|
||||
is_p ); \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use ); \
|
||||
} \
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
|
||||
p_begin, rs_p, cs_p, "%9.2e", "" ); \
|
||||
else if ( col_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
|
||||
p_begin, rs_p, cs_p, "%9.2e", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
/*p_inc = ldp * panel_len_max_i;*/ \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
if ( bli_is_4mi_packed( schema ) ) { \
|
||||
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
|
||||
if ( col_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
if ( row_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
} \
|
||||
*/ \
|
||||
/*
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
*/ \
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
inc_t is_b = rs_p * *m_panel_max; \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
\
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t )
|
||||
INSERT_GENTFUNCR_BASIC( packm_blk_var1, packm_ker_t )
|
||||
|
||||
|
||||
@@ -59,15 +59,12 @@ typedef void (*FUNCPTR_T)(
|
||||
packm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2);
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
|
||||
|
||||
extern func_t* packm_struc_cxk_kers;
|
||||
extern func_t* packm_struc_cxk_4mi_kers;
|
||||
extern func_t* packm_struc_cxk_3mis_kers;
|
||||
extern func_t* packm_struc_cxk_rih_kers;
|
||||
|
||||
|
||||
void bli_packm_blk_var2( obj_t* c,
|
||||
void bli_packm_blk_var1( obj_t* c,
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t )
|
||||
{
|
||||
@@ -99,8 +96,6 @@ void bli_packm_blk_var2( obj_t* c,
|
||||
dim_t pd_p = bli_obj_panel_dim( *p );
|
||||
inc_t ps_p = bli_obj_panel_stride( *p );
|
||||
|
||||
obj_t kappa;
|
||||
obj_t* kappa_p;
|
||||
void* buf_kappa;
|
||||
|
||||
func_t* packm_kers;
|
||||
@@ -108,59 +103,14 @@ void bli_packm_blk_var2( obj_t* c,
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// This variant assumes that the micro-kernel will always apply the
|
||||
// alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
|
||||
// for kappa so that the underlying packm implementation does not
|
||||
// scale during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
|
||||
|
||||
/*
|
||||
// We want this variant to behave identically to that of variant 1
|
||||
// in the real domain.
|
||||
if ( bli_is_real( dt_cp ) )
|
||||
{
|
||||
bli_packm_blk_var1( c, p, t );
|
||||
return;
|
||||
}
|
||||
*/
|
||||
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing induced complex domain algorithms in terms of
|
||||
// real domain micro-kernels. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if( thread_am_ochief( t ) )
|
||||
{
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
|
||||
|
||||
|
||||
// Choose the correct func_t object based on the pack_t schema.
|
||||
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
|
||||
else if ( bli_is_3mi_packed( schema ) ||
|
||||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
|
||||
else if ( bli_is_ro_packed( schema ) ||
|
||||
bli_is_io_packed( schema ) ||
|
||||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
|
||||
else packm_kers = packm_struc_cxk_kers;
|
||||
// Choose the correct func_t object.
|
||||
packm_kers = packm_struc_cxk_kers;
|
||||
|
||||
// Query the datatype-specific function pointer from the func_t object.
|
||||
packm_ker = bli_func_obj_query( dt_cp, packm_kers );
|
||||
@@ -168,9 +118,7 @@ void bli_packm_blk_var2( obj_t* c,
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
//f = ftypes[dt_cp];
|
||||
if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var2;
|
||||
else f = bli_zpackm_blk_var2;
|
||||
f = ftypes[dt_cp];
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
@@ -196,8 +144,8 @@ void bli_packm_blk_var2( obj_t* c,
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kertype ) \
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, kertype ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
@@ -256,9 +204,6 @@ void PASTEMAC(ch,varname)( \
|
||||
conj_t conjc; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
inc_t is_p_use; \
|
||||
dim_t ss_num; \
|
||||
dim_t ss_den; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
@@ -329,17 +274,6 @@ void PASTEMAC(ch,varname)( \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1. However,
|
||||
in the case of interleaved 3m, we need to scale by 3/2, and in the
|
||||
cases of real-only, imag-only, or summed-only, we need to scale by
|
||||
1/2. In both cases, we are compensating for the fact that pointer
|
||||
arithmetic occurs in terms of complex elements rather than real
|
||||
elements. */ \
|
||||
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
|
||||
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else { ss_num = 1; ss_den = 1; } \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
@@ -363,15 +297,6 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
if ( col_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
@@ -428,14 +353,6 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
\
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
/* We need to re-compute the imaginary stride as a function of
|
||||
panel_len_max_i since triangular packed matrices have panels
|
||||
of varying lengths. */ \
|
||||
is_p_use = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the imaginary stride up by one if it is odd. */ \
|
||||
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
@@ -453,31 +370,25 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use ); \
|
||||
is_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is usually LESS than ps_p because triangular
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
/*
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the panel increment up by one if it is odd. */ \
|
||||
p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \
|
||||
*/ \
|
||||
p_inc = ( is_p_use * ss_num ) / ss_den; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a Hermitian or
|
||||
symmetric matrix, which includes stored, unstored, and
|
||||
diagonal-intersecting panels. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
@@ -493,12 +404,13 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use ); \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p, \
|
||||
is_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
/*p_inc = ldp * panel_len_max_i;*/ \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
else \
|
||||
@@ -506,14 +418,9 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
/* This case executes if the panel is general, or, if the
|
||||
panel is part of a triangular matrix and is neither unstored
|
||||
(ie: zero) nor diagonal-intersecting. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
@@ -529,81 +436,28 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use ); \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p, \
|
||||
is_p ); \
|
||||
} \
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
|
||||
p_begin, rs_p, cs_p, "%9.2e", "" ); \
|
||||
else if ( col_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
|
||||
p_begin, rs_p, cs_p, "%9.2e", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
/*p_inc = ldp * panel_len_max_i;*/ \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
if ( bli_is_4mi_packed( schema ) ) { \
|
||||
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
|
||||
if ( col_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
if ( row_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
} \
|
||||
*/ \
|
||||
/*
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
*/ \
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
inc_t is_b = rs_p * *m_panel_max; \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
\
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_blk_var2, packm_ker_t )
|
||||
INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t )
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bli_packm_blk_var2( obj_t* c,
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t );
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void* packm_ker, \
|
||||
packm_thrinfo_t* t \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_blk_var2 )
|
||||
|
||||
@@ -44,7 +44,7 @@ static FUNCPTR_T vars[6][3] =
|
||||
{
|
||||
// unblocked optimized unblocked blocked
|
||||
{ bli_packm_unb_var1, NULL, bli_packm_blk_var1 },
|
||||
{ NULL, NULL, bli_packm_blk_var2 },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
|
||||
@@ -654,6 +654,14 @@
|
||||
bli_is_io_packed( schema ) || \
|
||||
bli_is_rpi_packed( schema ) )
|
||||
|
||||
// Evaluates to nonzero when none of the BLIS_PACK_FORMAT_BITS are set in
// the given pack_t schema, which this commit uses to detect packing for
// native execution (as opposed to an induced method). The argument is
// parenthesized so that an expression such as ( a | b ) is masked as a
// whole instead of being split by the higher precedence of '&'.
#define bli_is_nat_packed( schema ) \
	( ( ( schema ) & BLIS_PACK_FORMAT_BITS ) == 0 )
|
||||
|
||||
// Evaluates to nonzero when at least one of the BLIS_PACK_FORMAT_BITS is
// set in the given pack_t schema, which this commit uses to detect packing
// on behalf of an induced method. The argument is parenthesized so that an
// expression such as ( a | b ) is masked as a whole instead of being split
// by the higher precedence of '&'.
#define bli_is_ind_packed( schema ) \
	( ( ( schema ) & BLIS_PACK_FORMAT_BITS ) != 0 )
|
||||
|
||||
|
||||
// pointer-related
|
||||
|
||||
@@ -668,9 +676,6 @@
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// return datatype for char
|
||||
|
||||
#define bli_stype ( BLIS_FLOAT )
|
||||
|
||||
@@ -138,7 +138,7 @@ void bli_gemm3m1_cntl_init()
|
||||
gemm3m1_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m1_mr,
|
||||
gemm3m1_kr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -150,7 +150,7 @@ void bli_gemm3m1_cntl_init()
|
||||
gemm3m1_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m1_kr,
|
||||
gemm3m1_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -146,7 +146,7 @@ void bli_gemm3m2_cntl_init()
|
||||
gemm3m2_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m2_mr,
|
||||
gemm3m2_kr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -158,7 +158,7 @@ void bli_gemm3m2_cntl_init()
|
||||
gemm3m2_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m2_kr,
|
||||
gemm3m2_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -144,7 +144,7 @@ void bli_gemm3m3_cntl_init()
|
||||
gemm3m3_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m3_kr,
|
||||
gemm3m3_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -143,7 +143,7 @@ void bli_gemm3mh_cntl_init()
|
||||
gemm3mh_packa_cntl_ro
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3mh_mr,
|
||||
gemm3mh_kr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -155,7 +155,7 @@ void bli_gemm3mh_cntl_init()
|
||||
gemm3mh_packb_cntl_ro
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3mh_kr,
|
||||
gemm3mh_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -168,7 +168,7 @@ void bli_gemm3mh_cntl_init()
|
||||
gemm3mh_packa_cntl_io
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3mh_mr,
|
||||
gemm3mh_kr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -180,7 +180,7 @@ void bli_gemm3mh_cntl_init()
|
||||
gemm3mh_packb_cntl_io
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3mh_kr,
|
||||
gemm3mh_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -193,7 +193,7 @@ void bli_gemm3mh_cntl_init()
|
||||
gemm3mh_packa_cntl_rpi
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3mh_mr,
|
||||
gemm3mh_kr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -205,7 +205,7 @@ void bli_gemm3mh_cntl_init()
|
||||
gemm3mh_packb_cntl_rpi
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3mh_kr,
|
||||
gemm3mh_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -135,7 +135,7 @@ void bli_gemm4m1_cntl_init()
|
||||
gemm4m1_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4m1_mr,
|
||||
gemm4m1_kr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -147,7 +147,7 @@ void bli_gemm4m1_cntl_init()
|
||||
gemm4m1_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4m1_kr,
|
||||
gemm4m1_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -137,7 +137,7 @@ void bli_gemm4mb_cntl_init()
|
||||
gemm4mb_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4mb_mr,
|
||||
gemm4mb_kr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -149,7 +149,7 @@ void bli_gemm4mb_cntl_init()
|
||||
gemm4mb_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4mb_kr,
|
||||
gemm4mb_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -145,7 +145,7 @@ void bli_gemm4mh_cntl_init()
|
||||
gemm4mh_packa_cntl_ro
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4mh_mr,
|
||||
gemm4mh_kr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -157,7 +157,7 @@ void bli_gemm4mh_cntl_init()
|
||||
gemm4mh_packb_cntl_ro
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4mh_kr,
|
||||
gemm4mh_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -170,7 +170,7 @@ void bli_gemm4mh_cntl_init()
|
||||
gemm4mh_packa_cntl_io
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4mh_mr,
|
||||
gemm4mh_kr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -182,7 +182,7 @@ void bli_gemm4mh_cntl_init()
|
||||
gemm4mh_packb_cntl_io
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4mh_kr,
|
||||
gemm4mh_nr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -112,7 +112,7 @@ void bli_trsm3m1_cntl_init()
|
||||
trsm3m1_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: n dim multiple must be mr to
|
||||
// support right and bottom-right edge cases
|
||||
gemm3m1_mr,
|
||||
@@ -126,7 +126,7 @@ void bli_trsm3m1_cntl_init()
|
||||
trsm3m1_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: m dim multiple must be mr since
|
||||
// B_pack is updated (ie: serves as C) in trsm
|
||||
gemm3m1_mr,
|
||||
@@ -141,7 +141,7 @@ void bli_trsm3m1_cntl_init()
|
||||
trsm3m1_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m1_nr,
|
||||
gemm3m1_mr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -153,7 +153,7 @@ void bli_trsm3m1_cntl_init()
|
||||
trsm3m1_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m1_mr,
|
||||
gemm3m1_mr,
|
||||
TRUE, // invert diagonal
|
||||
|
||||
@@ -112,7 +112,7 @@ void bli_trsm4m1_cntl_init()
|
||||
trsm4m1_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: n dim multiple must be mr to
|
||||
// support right and bottom-right edge cases
|
||||
gemm4m1_mr,
|
||||
@@ -126,7 +126,7 @@ void bli_trsm4m1_cntl_init()
|
||||
trsm4m1_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: m dim multiple must be mr since
|
||||
// B_pack is updated (ie: serves as C) in trsm
|
||||
gemm4m1_mr,
|
||||
@@ -141,7 +141,7 @@ void bli_trsm4m1_cntl_init()
|
||||
trsm4m1_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4m1_nr,
|
||||
gemm4m1_mr,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -153,7 +153,7 @@ void bli_trsm4m1_cntl_init()
|
||||
trsm4m1_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_VARIANT1,
|
||||
gemm4m1_mr,
|
||||
gemm4m1_mr,
|
||||
TRUE, // invert diagonal
|
||||
|
||||
Reference in New Issue
Block a user