mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Consolidated slab/rr-explicit level-3 macrokernels.
Details:
- Consolidated the *sl.c and *rr.c level-3 macrokernels into a single
file per sl/rr pair, with those files named as they were before
c92762e. The consolidation does not take away the *option* of using
slab or round-robin assignment of micropanels to threads; it merely
*hides* the choice within the definitions of functions such as
bli_thread_range_jrir(), bli_packm_my_iter(), and bli_is_last_iter()
rather than expose that choice explicitly in the code. The choice of
slab or rr is not always hidden, however; there are some cases
involving herk and trmm, for example, that require some part of the
computation to use rr unconditionally. (The --thread-part-jrir option
controls the partitioning in all other cases.)
- Note: Originally, the sl and rr macrokernels were separated out for
clarity. However, aside from the additional binary code bloat, I later
deemed that clarity not worth the price of maintaining the additional
(mostly similar) codes.
This commit is contained in:
@@ -35,6 +35,72 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
struc_t strucc,
|
||||
doff_t diagoffc,
|
||||
diag_t diagc,
|
||||
uplo_t uploc,
|
||||
trans_t transc,
|
||||
pack_t schema,
|
||||
bool_t invdiag,
|
||||
bool_t revifup,
|
||||
bool_t reviflo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
inc_t is_p,
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
void* packm_ker,
|
||||
cntx_t* cntx,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
|
||||
|
||||
|
||||
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
|
||||
{
|
||||
/* float (0) scomplex (1) double (2) dcomplex (3) */
|
||||
// 0000 row/col panels
|
||||
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
|
||||
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
|
||||
// 0001 row/col panels: 4m interleaved
|
||||
{ { NULL, bli_cpackm_struc_cxk_4mi,
|
||||
NULL, bli_zpackm_struc_cxk_4mi, } },
|
||||
// 0010 row/col panels: 3m interleaved
|
||||
{ { NULL, bli_cpackm_struc_cxk_3mis,
|
||||
NULL, bli_zpackm_struc_cxk_3mis, } },
|
||||
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
|
||||
{ { NULL, NULL,
|
||||
NULL, NULL, } },
|
||||
// 0100 row/col panels: 3m separated
|
||||
{ { NULL, bli_cpackm_struc_cxk_3mis,
|
||||
NULL, bli_zpackm_struc_cxk_3mis, } },
|
||||
// 0101 row/col panels: real only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 0110 row/col panels: imaginary only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 0111 row/col panels: real+imaginary only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 1000 row/col panels: 1m-expanded (1e)
|
||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
||||
// 1001 row/col panels: 1m-reordered (1r)
|
||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
||||
};
|
||||
|
||||
|
||||
void bli_packm_blk_var1
|
||||
(
|
||||
obj_t* c,
|
||||
@@ -44,14 +110,632 @@ void bli_packm_blk_var1
|
||||
thrinfo_t* t
|
||||
)
|
||||
{
|
||||
#ifdef BLIS_ENABLE_JRIR_SLAB
|
||||
num_t dt_cp = bli_obj_dt( c );
|
||||
|
||||
bli_packm_blk_var1sl( c, p, cntx, cntl, t );
|
||||
struc_t strucc = bli_obj_struc( c );
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
diag_t diagc = bli_obj_diag( c );
|
||||
uplo_t uploc = bli_obj_uplo( c );
|
||||
trans_t transc = bli_obj_conjtrans_status( c );
|
||||
pack_t schema = bli_obj_pack_schema( p );
|
||||
bool_t invdiag = bli_obj_has_inverted_diag( p );
|
||||
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
|
||||
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
|
||||
|
||||
#else // BLIS_ENABLE_JRIR_RR
|
||||
dim_t m_p = bli_obj_length( p );
|
||||
dim_t n_p = bli_obj_width( p );
|
||||
dim_t m_max_p = bli_obj_padded_length( p );
|
||||
dim_t n_max_p = bli_obj_padded_width( p );
|
||||
|
||||
bli_packm_blk_var1rr( c, p, cntx, cntl, t );
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( p );
|
||||
inc_t rs_p = bli_obj_row_stride( p );
|
||||
inc_t cs_p = bli_obj_col_stride( p );
|
||||
inc_t is_p = bli_obj_imag_stride( p );
|
||||
dim_t pd_p = bli_obj_panel_dim( p );
|
||||
inc_t ps_p = bli_obj_panel_stride( p );
|
||||
|
||||
obj_t kappa;
|
||||
obj_t* kappa_p;
|
||||
void* buf_kappa;
|
||||
|
||||
func_t* packm_kers;
|
||||
void* packm_ker;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
// Treatment of kappa (ie: packing during scaling) depends on
|
||||
// whether we are executing an induced method.
|
||||
if ( bli_is_nat_packed( schema ) )
|
||||
{
|
||||
// This branch is for native execution, where we assume that
|
||||
// the micro-kernel will always apply the alpha scalar of the
|
||||
// higher-level operation. Thus, we use BLIS_ONE for kappa so
|
||||
// that the underlying packm implementation does not perform
|
||||
// any scaling during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
|
||||
}
|
||||
else // if ( bli_is_ind_packed( schema ) )
|
||||
{
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing induced complex domain algorithms in terms of
|
||||
// real domain micro-kernels. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
//printf( "applying non-zero imag kappa\n" );
|
||||
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = κ
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
|
||||
}
|
||||
|
||||
|
||||
// Choose the correct func_t object based on the pack_t schema.
|
||||
#if 0
|
||||
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
|
||||
else if ( bli_is_3mi_packed( schema ) ||
|
||||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
|
||||
else if ( bli_is_ro_packed( schema ) ||
|
||||
bli_is_io_packed( schema ) ||
|
||||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
|
||||
else packm_kers = packm_struc_cxk_kers;
|
||||
#else
|
||||
// The original idea here was to read the packm_ukr from the context
|
||||
// if it is non-NULL. The problem is, it requires that we be able to
|
||||
// assume that the packm_ukr field is initialized to NULL, which it
|
||||
// currently is not.
|
||||
|
||||
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
|
||||
|
||||
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
|
||||
{
|
||||
// If the packm structure-aware kernel func_t in the context is
|
||||
// NULL (which is the default value after the context is created),
|
||||
// we use the default lookup table to determine the right func_t
|
||||
// for the current schema.
|
||||
const dim_t i = bli_pack_schema_index( schema );
|
||||
|
||||
packm_kers = &packm_struc_cxk_kers[ i ];
|
||||
}
|
||||
#if 0
|
||||
else // cntx's packm func_t overrides
|
||||
{
|
||||
// If the packm structure-aware kernel func_t in the context is
|
||||
// non-NULL (ie: assumed to be valid), we use that instead.
|
||||
//packm_kers = bli_cntx_packm_ukrs( cntx );
|
||||
packm_kers = cntx_packm_kers;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Query the datatype-specific function pointer from the func_t object.
|
||||
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_cp];
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
schema,
|
||||
invdiag,
|
||||
revifup,
|
||||
reviflo,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
is_p,
|
||||
pd_p, ps_p,
|
||||
packm_ker,
|
||||
cntx,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void* packm_ker, \
|
||||
cntx_t* cntx, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
|
||||
\
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict c_begin; \
|
||||
ctype* restrict p_begin; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it, ic, ip; \
|
||||
dim_t ic0, ip0; \
|
||||
doff_t ic_inc, ip_inc; \
|
||||
doff_t diagoffc_i; \
|
||||
doff_t diagoffc_inc; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_off_i; \
|
||||
inc_t vs_c; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp, p_inc; \
|
||||
dim_t* m_panel_full; \
|
||||
dim_t* n_panel_full; \
|
||||
dim_t* m_panel_use; \
|
||||
dim_t* n_panel_use; \
|
||||
dim_t* m_panel_max; \
|
||||
dim_t* n_panel_max; \
|
||||
conj_t conjc; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
inc_t is_p_use; \
|
||||
dim_t ss_num; \
|
||||
dim_t ss_den; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
doff_t diagoffp_i; \
|
||||
\
|
||||
\
|
||||
/* If C is zeros and part of a triangular matrix, then we don't need
|
||||
to pack it. */ \
|
||||
if ( bli_is_zeros( uploc ) && \
|
||||
bli_is_triangular( strucc ) ) return; \
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
express the remaining parameters and code. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_swap_incs( &rs_c, &cs_c ); \
|
||||
bli_negate_diag_offset( &diagoffc ); \
|
||||
bli_toggle_uplo( &uploc ); \
|
||||
bli_toggle_trans( &transc ); \
|
||||
} \
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t )panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
m_panel_use = &panel_len_i; \
|
||||
n_panel_use = &panel_dim_i; \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim_max; \
|
||||
ldp = cs_p; \
|
||||
m_panel_full = &panel_dim_i; \
|
||||
n_panel_full = &n; \
|
||||
m_panel_use = &panel_dim_i; \
|
||||
n_panel_use = &panel_len_i; \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1. However,
|
||||
in the case of interleaved 3m, we need to scale by 3/2, and in the
|
||||
cases of real-only, imag-only, or summed-only, we need to scale by
|
||||
1/2. In both cases, we are compensating for the fact that pointer
|
||||
arithmetic occurs in terms of complex elements rather than real
|
||||
elements. */ \
|
||||
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
|
||||
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else { ss_num = 1; ss_den = 1; } \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
|
||||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
|
||||
{ \
|
||||
ic0 = (n_iter - 1) * panel_dim_max; \
|
||||
ic_inc = -panel_dim_max; \
|
||||
ip0 = n_iter - 1; \
|
||||
ip_inc = -1; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
ip0 = 0; \
|
||||
ip_inc = 1; \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||
will depend on whether slab or round-robin partitioning was requested
|
||||
at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is completely unstored (ie: zero). If the panel
|
||||
is unstored, we do nothing. (Notice that we don't even
|
||||
increment p_begin.) */ \
|
||||
\
|
||||
continue; \
|
||||
} \
|
||||
else if ( bli_is_triangular( strucc ) && \
|
||||
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is diagonal-intersecting. Notice that we
|
||||
cannot bury the following conditional logic into
|
||||
packm_struc_cxk() because we need to know the value of
|
||||
panel_len_max_i so we can properly increment p_inc. */ \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc_i < 0 ) || \
|
||||
( row_stored && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
|
||||
panel_len_max ); \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
panel_len_max_i = panel_len_max - panel_off_i; \
|
||||
diagoffp_i = 0; \
|
||||
} \
|
||||
\
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
/* We need to re-compute the imaginary stride as a function of
|
||||
panel_len_max_i since triangular packed matrices have panels
|
||||
of varying lengths. NOTE: This imaginary stride value is
|
||||
only referenced by the packm kernels for induced methods. */ \
|
||||
is_p_use = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the imaginary stride up by one if it is odd. */ \
|
||||
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
|
||||
\
|
||||
/* NOTE: We MUST use round-robin partitioning when packing
|
||||
micropanels of a triangular matrix. Hermitian/symmetric
|
||||
and general packing may use slab or round-robin, depending
|
||||
on which was selected at configure-time. */ \
|
||||
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is usually LESS than ps_p because triangular
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
p_inc = ( is_p_use * ss_num ) / ss_den; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a Hermitian or
|
||||
symmetric matrix, which includes stored, unstored, and
|
||||
diagonal-intersecting panels. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||
or round-robin partitioning was requested at configure-time. */ \
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffc_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* This case executes if the panel is general, or, if the
|
||||
panel is part of a triangular matrix and is neither unstored
|
||||
(ie: zero) nor diagonal-intersecting. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||
or round-robin partitioning was requested at configure-time. */ \
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( BLIS_GENERAL, \
|
||||
0, \
|
||||
diagc, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
\
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 )
|
||||
|
||||
|
||||
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
if ( col_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
} \
|
||||
else { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
if ( bli_is_4mi_packed( schema ) ) { \
|
||||
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
|
||||
if ( col_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
if ( row_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
inc_t is_b = rs_p * *m_panel_max; \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
|
||||
@@ -1,737 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
struc_t strucc,
|
||||
doff_t diagoffc,
|
||||
diag_t diagc,
|
||||
uplo_t uploc,
|
||||
trans_t transc,
|
||||
pack_t schema,
|
||||
bool_t invdiag,
|
||||
bool_t revifup,
|
||||
bool_t reviflo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
inc_t is_p,
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
void* packm_ker,
|
||||
cntx_t* cntx,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1rr);
|
||||
|
||||
|
||||
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
|
||||
{
|
||||
/* float (0) scomplex (1) double (2) dcomplex (3) */
|
||||
// 0000 row/col panels
|
||||
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
|
||||
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
|
||||
// 0001 row/col panels: 4m interleaved
|
||||
{ { NULL, bli_cpackm_struc_cxk_4mi,
|
||||
NULL, bli_zpackm_struc_cxk_4mi, } },
|
||||
// 0010 row/col panels: 3m interleaved
|
||||
{ { NULL, bli_cpackm_struc_cxk_3mis,
|
||||
NULL, bli_zpackm_struc_cxk_3mis, } },
|
||||
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
|
||||
{ { NULL, NULL,
|
||||
NULL, NULL, } },
|
||||
// 0100 row/col panels: 3m separated
|
||||
{ { NULL, bli_cpackm_struc_cxk_3mis,
|
||||
NULL, bli_zpackm_struc_cxk_3mis, } },
|
||||
// 0101 row/col panels: real only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 0110 row/col panels: imaginary only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 0111 row/col panels: real+imaginary only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 1000 row/col panels: 1m-expanded (1e)
|
||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
||||
// 1001 row/col panels: 1m-reordered (1r)
|
||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
||||
};
|
||||
|
||||
|
||||
void bli_packm_blk_var1rr
|
||||
(
|
||||
obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* t
|
||||
)
|
||||
{
|
||||
num_t dt_cp = bli_obj_dt( c );
|
||||
|
||||
struc_t strucc = bli_obj_struc( c );
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
diag_t diagc = bli_obj_diag( c );
|
||||
uplo_t uploc = bli_obj_uplo( c );
|
||||
trans_t transc = bli_obj_conjtrans_status( c );
|
||||
pack_t schema = bli_obj_pack_schema( p );
|
||||
bool_t invdiag = bli_obj_has_inverted_diag( p );
|
||||
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
|
||||
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
|
||||
|
||||
dim_t m_p = bli_obj_length( p );
|
||||
dim_t n_p = bli_obj_width( p );
|
||||
dim_t m_max_p = bli_obj_padded_length( p );
|
||||
dim_t n_max_p = bli_obj_padded_width( p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( p );
|
||||
inc_t rs_p = bli_obj_row_stride( p );
|
||||
inc_t cs_p = bli_obj_col_stride( p );
|
||||
inc_t is_p = bli_obj_imag_stride( p );
|
||||
dim_t pd_p = bli_obj_panel_dim( p );
|
||||
inc_t ps_p = bli_obj_panel_stride( p );
|
||||
|
||||
obj_t kappa;
|
||||
obj_t* kappa_p;
|
||||
void* buf_kappa;
|
||||
|
||||
func_t* packm_kers;
|
||||
void* packm_ker;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
// Treatment of kappa (ie: packing during scaling) depends on
|
||||
// whether we are executing an induced method.
|
||||
if ( bli_is_nat_packed( schema ) )
|
||||
{
|
||||
// This branch is for native execution, where we assume that
|
||||
// the micro-kernel will always apply the alpha scalar of the
|
||||
// higher-level operation. Thus, we use BLIS_ONE for kappa so
|
||||
// that the underlying packm implementation does not perform
|
||||
// any scaling during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
|
||||
}
|
||||
else // if ( bli_is_ind_packed( schema ) )
|
||||
{
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing induced complex domain algorithms in terms of
|
||||
// real domain micro-kernels. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
//printf( "applying non-zero imag kappa\n" );
|
||||
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = κ
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
|
||||
}
|
||||
|
||||
|
||||
// Choose the correct func_t object based on the pack_t schema.
|
||||
#if 0
|
||||
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
|
||||
else if ( bli_is_3mi_packed( schema ) ||
|
||||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
|
||||
else if ( bli_is_ro_packed( schema ) ||
|
||||
bli_is_io_packed( schema ) ||
|
||||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
|
||||
else packm_kers = packm_struc_cxk_kers;
|
||||
#else
|
||||
// The original idea here was to read the packm_ukr from the context
|
||||
// if it is non-NULL. The problem is, it requires that we be able to
|
||||
// assume that the packm_ukr field is initialized to NULL, which it
|
||||
// currently is not.
|
||||
|
||||
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
|
||||
|
||||
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
|
||||
{
|
||||
// If the packm structure-aware kernel func_t in the context is
|
||||
// NULL (which is the default value after the context is created),
|
||||
// we use the default lookup table to determine the right func_t
|
||||
// for the current schema.
|
||||
const dim_t i = bli_pack_schema_index( schema );
|
||||
|
||||
packm_kers = &packm_struc_cxk_kers[ i ];
|
||||
}
|
||||
#if 0
|
||||
else // cntx's packm func_t overrides
|
||||
{
|
||||
// If the packm structure-aware kernel func_t in the context is
|
||||
// non-NULL (ie: assumed to be valid), we use that instead.
|
||||
//packm_kers = bli_cntx_packm_ukrs( cntx );
|
||||
packm_kers = cntx_packm_kers;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Query the datatype-specific function pointer from the func_t object.
|
||||
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_cp];
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
schema,
|
||||
invdiag,
|
||||
revifup,
|
||||
reviflo,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
is_p,
|
||||
pd_p, ps_p,
|
||||
packm_ker,
|
||||
cntx,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void* packm_ker, \
|
||||
cntx_t* cntx, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
|
||||
\
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict c_begin; \
|
||||
ctype* restrict p_begin; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it, ic, ip; \
|
||||
dim_t ic0, ip0; \
|
||||
doff_t ic_inc, ip_inc; \
|
||||
doff_t diagoffc_i; \
|
||||
doff_t diagoffc_inc; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_off_i; \
|
||||
inc_t vs_c; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp, p_inc; \
|
||||
dim_t* m_panel_full; \
|
||||
dim_t* n_panel_full; \
|
||||
dim_t* m_panel_use; \
|
||||
dim_t* n_panel_use; \
|
||||
dim_t* m_panel_max; \
|
||||
dim_t* n_panel_max; \
|
||||
conj_t conjc; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
inc_t is_p_use; \
|
||||
dim_t ss_num; \
|
||||
dim_t ss_den; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
doff_t diagoffp_i; \
|
||||
\
|
||||
\
|
||||
/* If C is zeros and part of a triangular matrix, then we don't need
|
||||
to pack it. */ \
|
||||
if ( bli_is_zeros( uploc ) && \
|
||||
bli_is_triangular( strucc ) ) return; \
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
express the remaining parameters and code. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_swap_incs( &rs_c, &cs_c ); \
|
||||
bli_negate_diag_offset( &diagoffc ); \
|
||||
bli_toggle_uplo( &uploc ); \
|
||||
bli_toggle_trans( &transc ); \
|
||||
} \
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t )panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
m_panel_use = &panel_len_i; \
|
||||
n_panel_use = &panel_dim_i; \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim_max; \
|
||||
ldp = cs_p; \
|
||||
m_panel_full = &panel_dim_i; \
|
||||
n_panel_full = &n; \
|
||||
m_panel_use = &panel_dim_i; \
|
||||
n_panel_use = &panel_len_i; \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1. However,
|
||||
in the case of interleaved 3m, we need to scale by 3/2, and in the
|
||||
cases of real-only, imag-only, or summed-only, we need to scale by
|
||||
1/2. In both cases, we are compensating for the fact that pointer
|
||||
arithmetic occurs in terms of complex elements rather than real
|
||||
elements. */ \
|
||||
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
|
||||
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else { ss_num = 1; ss_den = 1; } \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
|
||||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
|
||||
{ \
|
||||
ic0 = (n_iter - 1) * panel_dim_max; \
|
||||
ic_inc = -panel_dim_max; \
|
||||
ip0 = n_iter - 1; \
|
||||
ip_inc = -1; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
ip0 = 0; \
|
||||
ip_inc = 1; \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is completely unstored (ie: zero). If the panel
|
||||
is unstored, we do nothing. (Notice that we don't even
|
||||
increment p_begin.) */ \
|
||||
\
|
||||
continue; \
|
||||
} \
|
||||
else if ( bli_is_triangular( strucc ) && \
|
||||
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is diagonal-intersecting. Notice that we
|
||||
cannot bury the following conditional logic into
|
||||
packm_struc_cxk() because we need to know the value of
|
||||
panel_len_max_i so we can properly increment p_inc. */ \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc_i < 0 ) || \
|
||||
( row_stored && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
|
||||
panel_len_max ); \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
panel_len_max_i = panel_len_max - panel_off_i; \
|
||||
diagoffp_i = 0; \
|
||||
} \
|
||||
\
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
/* We need to re-compute the imaginary stride as a function of
|
||||
panel_len_max_i since triangular packed matrices have panels
|
||||
of varying lengths. NOTE: This imaginary stride value is
|
||||
only referenced by the packm kernels for induced methods. */ \
|
||||
is_p_use = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the imaginary stride up by one if it is odd. */ \
|
||||
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
|
||||
\
|
||||
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is usually LESS than ps_p because triangular
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
p_inc = ( is_p_use * ss_num ) / ss_den; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a Hermitian or
|
||||
symmetric matrix, which includes stored, unstored, and
|
||||
diagonal-intersecting panels. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffc_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* This case executes if the panel is general, or, if the
|
||||
panel is part of a triangular matrix and is neither unstored
|
||||
(ie: zero) nor diagonal-intersecting. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
/*
|
||||
printf( "thread %d: packing micropanel iteration %3d\n", (int)tid, (int)it ); \
|
||||
*/ \
|
||||
packm_ker_cast( BLIS_GENERAL, \
|
||||
0, \
|
||||
diagc, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
\
|
||||
} \
|
||||
/*
|
||||
printf( "thread %d: done\n", (int)tid ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1rr )
|
||||
|
||||
|
||||
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
if ( col_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
} \
|
||||
else { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
if ( bli_is_4mi_packed( schema ) ) { \
|
||||
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
|
||||
if ( col_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
if ( row_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
inc_t is_b = rs_p * *m_panel_max; \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
@@ -1,737 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
struc_t strucc,
|
||||
doff_t diagoffc,
|
||||
diag_t diagc,
|
||||
uplo_t uploc,
|
||||
trans_t transc,
|
||||
pack_t schema,
|
||||
bool_t invdiag,
|
||||
bool_t revifup,
|
||||
bool_t reviflo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
inc_t is_p,
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
void* packm_ker,
|
||||
cntx_t* cntx,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1sl);
|
||||
|
||||
|
||||
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
|
||||
{
|
||||
/* float (0) scomplex (1) double (2) dcomplex (3) */
|
||||
// 0000 row/col panels
|
||||
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
|
||||
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
|
||||
// 0001 row/col panels: 4m interleaved
|
||||
{ { NULL, bli_cpackm_struc_cxk_4mi,
|
||||
NULL, bli_zpackm_struc_cxk_4mi, } },
|
||||
// 0010 row/col panels: 3m interleaved
|
||||
{ { NULL, bli_cpackm_struc_cxk_3mis,
|
||||
NULL, bli_zpackm_struc_cxk_3mis, } },
|
||||
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
|
||||
{ { NULL, NULL,
|
||||
NULL, NULL, } },
|
||||
// 0100 row/col panels: 3m separated
|
||||
{ { NULL, bli_cpackm_struc_cxk_3mis,
|
||||
NULL, bli_zpackm_struc_cxk_3mis, } },
|
||||
// 0101 row/col panels: real only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 0110 row/col panels: imaginary only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 0111 row/col panels: real+imaginary only
|
||||
{ { NULL, bli_cpackm_struc_cxk_rih,
|
||||
NULL, bli_zpackm_struc_cxk_rih, } },
|
||||
// 1000 row/col panels: 1m-expanded (1e)
|
||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
||||
// 1001 row/col panels: 1m-reordered (1r)
|
||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
||||
};
|
||||
|
||||
|
||||
void bli_packm_blk_var1sl
|
||||
(
|
||||
obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* t
|
||||
)
|
||||
{
|
||||
num_t dt_cp = bli_obj_dt( c );
|
||||
|
||||
struc_t strucc = bli_obj_struc( c );
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
diag_t diagc = bli_obj_diag( c );
|
||||
uplo_t uploc = bli_obj_uplo( c );
|
||||
trans_t transc = bli_obj_conjtrans_status( c );
|
||||
pack_t schema = bli_obj_pack_schema( p );
|
||||
bool_t invdiag = bli_obj_has_inverted_diag( p );
|
||||
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
|
||||
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
|
||||
|
||||
dim_t m_p = bli_obj_length( p );
|
||||
dim_t n_p = bli_obj_width( p );
|
||||
dim_t m_max_p = bli_obj_padded_length( p );
|
||||
dim_t n_max_p = bli_obj_padded_width( p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( p );
|
||||
inc_t rs_p = bli_obj_row_stride( p );
|
||||
inc_t cs_p = bli_obj_col_stride( p );
|
||||
inc_t is_p = bli_obj_imag_stride( p );
|
||||
dim_t pd_p = bli_obj_panel_dim( p );
|
||||
inc_t ps_p = bli_obj_panel_stride( p );
|
||||
|
||||
obj_t kappa;
|
||||
obj_t* kappa_p;
|
||||
void* buf_kappa;
|
||||
|
||||
func_t* packm_kers;
|
||||
void* packm_ker;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
// Treatment of kappa (ie: packing during scaling) depends on
|
||||
// whether we are executing an induced method.
|
||||
if ( bli_is_nat_packed( schema ) )
|
||||
{
|
||||
// This branch is for native execution, where we assume that
|
||||
// the micro-kernel will always apply the alpha scalar of the
|
||||
// higher-level operation. Thus, we use BLIS_ONE for kappa so
|
||||
// that the underlying packm implementation does not perform
|
||||
// any scaling during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
|
||||
}
|
||||
else // if ( bli_is_ind_packed( schema ) )
|
||||
{
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing induced complex domain algorithms in terms of
|
||||
// real domain micro-kernels. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
//printf( "applying non-zero imag kappa\n" );
|
||||
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = κ
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
|
||||
}
|
||||
|
||||
|
||||
// Choose the correct func_t object based on the pack_t schema.
|
||||
#if 0
|
||||
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
|
||||
else if ( bli_is_3mi_packed( schema ) ||
|
||||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
|
||||
else if ( bli_is_ro_packed( schema ) ||
|
||||
bli_is_io_packed( schema ) ||
|
||||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
|
||||
else packm_kers = packm_struc_cxk_kers;
|
||||
#else
|
||||
// The original idea here was to read the packm_ukr from the context
|
||||
// if it is non-NULL. The problem is, it requires that we be able to
|
||||
// assume that the packm_ukr field is initialized to NULL, which it
|
||||
// currently is not.
|
||||
|
||||
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
|
||||
|
||||
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
|
||||
{
|
||||
// If the packm structure-aware kernel func_t in the context is
|
||||
// NULL (which is the default value after the context is created),
|
||||
// we use the default lookup table to determine the right func_t
|
||||
// for the current schema.
|
||||
const dim_t i = bli_pack_schema_index( schema );
|
||||
|
||||
packm_kers = &packm_struc_cxk_kers[ i ];
|
||||
}
|
||||
#if 0
|
||||
else // cntx's packm func_t overrides
|
||||
{
|
||||
// If the packm structure-aware kernel func_t in the context is
|
||||
// non-NULL (ie: assumed to be valid), we use that instead.
|
||||
//packm_kers = bli_cntx_packm_ukrs( cntx );
|
||||
packm_kers = cntx_packm_kers;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Query the datatype-specific function pointer from the func_t object.
|
||||
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_cp];
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
schema,
|
||||
invdiag,
|
||||
revifup,
|
||||
reviflo,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
is_p,
|
||||
pd_p, ps_p,
|
||||
packm_ker,
|
||||
cntx,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void* packm_ker, \
|
||||
cntx_t* cntx, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
|
||||
\
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict c_begin; \
|
||||
ctype* restrict p_begin; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it, ic, ip; \
|
||||
dim_t ic0, ip0; \
|
||||
doff_t ic_inc, ip_inc; \
|
||||
doff_t diagoffc_i; \
|
||||
doff_t diagoffc_inc; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_off_i; \
|
||||
inc_t vs_c; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp, p_inc; \
|
||||
dim_t* m_panel_full; \
|
||||
dim_t* n_panel_full; \
|
||||
dim_t* m_panel_use; \
|
||||
dim_t* n_panel_use; \
|
||||
dim_t* m_panel_max; \
|
||||
dim_t* n_panel_max; \
|
||||
conj_t conjc; \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
inc_t is_p_use; \
|
||||
dim_t ss_num; \
|
||||
dim_t ss_den; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
doff_t diagoffp_i; \
|
||||
\
|
||||
\
|
||||
/* If C is zeros and part of a triangular matrix, then we don't need
|
||||
to pack it. */ \
|
||||
if ( bli_is_zeros( uploc ) && \
|
||||
bli_is_triangular( strucc ) ) return; \
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
express the remaining parameters and code. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_swap_incs( &rs_c, &cs_c ); \
|
||||
bli_negate_diag_offset( &diagoffc ); \
|
||||
bli_toggle_uplo( &uploc ); \
|
||||
bli_toggle_trans( &transc ); \
|
||||
} \
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t )panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
m_panel_use = &panel_len_i; \
|
||||
n_panel_use = &panel_dim_i; \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim_max; \
|
||||
ldp = cs_p; \
|
||||
m_panel_full = &panel_dim_i; \
|
||||
n_panel_full = &n; \
|
||||
m_panel_use = &panel_dim_i; \
|
||||
n_panel_use = &panel_len_i; \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1. However,
|
||||
in the case of interleaved 3m, we need to scale by 3/2, and in the
|
||||
cases of real-only, imag-only, or summed-only, we need to scale by
|
||||
1/2. In both cases, we are compensating for the fact that pointer
|
||||
arithmetic occurs in terms of complex elements rather than real
|
||||
elements. */ \
|
||||
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
|
||||
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else { ss_num = 1; ss_den = 1; } \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
|
||||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
|
||||
{ \
|
||||
ic0 = (n_iter - 1) * panel_dim_max; \
|
||||
ic_inc = -panel_dim_max; \
|
||||
ip0 = n_iter - 1; \
|
||||
ip_inc = -1; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
ip0 = 0; \
|
||||
ip_inc = 1; \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is completely unstored (ie: zero). If the panel
|
||||
is unstored, we do nothing. (Notice that we don't even
|
||||
increment p_begin.) */ \
|
||||
\
|
||||
continue; \
|
||||
} \
|
||||
else if ( bli_is_triangular( strucc ) && \
|
||||
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is diagonal-intersecting. Notice that we
|
||||
cannot bury the following conditional logic into
|
||||
packm_struc_cxk() because we need to know the value of
|
||||
panel_len_max_i so we can properly increment p_inc. */ \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc_i < 0 ) || \
|
||||
( row_stored && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
|
||||
panel_len_max ); \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
panel_len_max_i = panel_len_max - panel_off_i; \
|
||||
diagoffp_i = 0; \
|
||||
} \
|
||||
\
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
/* We need to re-compute the imaginary stride as a function of
|
||||
panel_len_max_i since triangular packed matrices have panels
|
||||
of varying lengths. NOTE: This imaginary stride value is
|
||||
only referenced by the packm kernels for induced methods. */ \
|
||||
is_p_use = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the imaginary stride up by one if it is odd. */ \
|
||||
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
|
||||
\
|
||||
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is usually LESS than ps_p because triangular
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
p_inc = ( is_p_use * ss_num ) / ss_den; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a Hermitian or
|
||||
symmetric matrix, which includes stored, unstored, and
|
||||
diagonal-intersecting panels. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffc_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* This case executes if the panel is general, or, if the
|
||||
panel is part of a triangular matrix and is neither unstored
|
||||
(ie: zero) nor diagonal-intersecting. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
/*
|
||||
printf( "thread %d: packing micropanel iteration %3d\n", (int)tid, (int)it ); \
|
||||
*/ \
|
||||
packm_ker_cast( BLIS_GENERAL, \
|
||||
0, \
|
||||
diagc, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
\
|
||||
} \
|
||||
/*
|
||||
printf( "thread %d: done\n", (int)tid ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1sl )
|
||||
|
||||
|
||||
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
if ( col_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
} \
|
||||
else { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
if ( bli_is_4mi_packed( schema ) ) { \
|
||||
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
|
||||
if ( col_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
if ( row_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
inc_t is_b = rs_p * *m_panel_max; \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
@@ -51,7 +51,18 @@
|
||||
\
|
||||
( start <= i && i < end )
|
||||
|
||||
// Define a general-purpose version of bli_packm_my_iter() whose definition
|
||||
// depends on whether slab or round-robin partitioning was requested at
|
||||
// configure-time.
|
||||
#ifdef BLIS_ENABLE_JRIR_SLAB
|
||||
|
||||
#define bli_packm_my_iter bli_packm_my_iter_sl
|
||||
|
||||
#else // BLIS_ENABLE_JRIR_RR
|
||||
|
||||
#define bli_packm_my_iter bli_packm_my_iter_rr
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -49,10 +49,8 @@ void PASTEMAC0(opname) \
|
||||
thrinfo_t* t \
|
||||
);
|
||||
|
||||
GENPROT( packm_unb_var1 )
|
||||
GENPROT( packm_blk_var1 )
|
||||
GENPROT( packm_blk_var1sl )
|
||||
GENPROT( packm_blk_var1rr )
|
||||
GENPROT( packm_unb_var1 )
|
||||
GENPROT( packm_blk_var1 )
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with void pointer operands.
|
||||
@@ -108,6 +106,5 @@ void PASTEMAC(ch,varname) \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_blk_var1sl )
|
||||
INSERT_GENTPROT_BASIC0( packm_blk_var1rr )
|
||||
INSERT_GENTPROT_BASIC0( packm_blk_var1 )
|
||||
|
||||
|
||||
@@ -39,26 +39,32 @@
|
||||
|
||||
// gemm
|
||||
|
||||
// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to
|
||||
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
|
||||
#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
|
||||
#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
|
||||
|
||||
// herk
|
||||
|
||||
// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to
|
||||
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
|
||||
#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
|
||||
#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
|
||||
|
||||
// trmm
|
||||
|
||||
// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to
|
||||
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
|
||||
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
|
||||
#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
|
||||
|
||||
#define bli_trmm_my_iter( index, thread ) \
|
||||
#define bli_trmm_my_iter_rr( index, thread ) \
|
||||
\
|
||||
( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
|
||||
// trsm
|
||||
|
||||
#define bli_trsm_my_iter( index, thread ) \
|
||||
#define bli_trsm_my_iter_rr( index, thread ) \
|
||||
\
|
||||
( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
|
||||
|
||||
@@ -58,30 +58,15 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
void* packa_fp;
|
||||
void* packb_fp;
|
||||
|
||||
#ifdef BLIS_ENABLE_JRIR_SLAB
|
||||
|
||||
// Use the function pointers to the macrokernels that use slab
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2sl;
|
||||
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2sl;
|
||||
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2sl;
|
||||
else macro_kernel_fp = NULL;
|
||||
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2;
|
||||
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2;
|
||||
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
|
||||
else /* should never execute */ macro_kernel_fp = NULL;
|
||||
|
||||
packa_fp = bli_packm_blk_var1sl;
|
||||
packb_fp = bli_packm_blk_var1sl;
|
||||
|
||||
#else // BLIS_ENABLE_JRIR_RR
|
||||
|
||||
// Use the function pointers to the macrokernels that use round-robin
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2rr;
|
||||
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2rr;
|
||||
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2rr;
|
||||
else macro_kernel_fp = NULL;
|
||||
|
||||
packa_fp = bli_packm_blk_var1rr;
|
||||
packb_fp = bli_packm_blk_var1rr;
|
||||
#endif
|
||||
packa_fp = bli_packm_blk_var1;
|
||||
packb_fp = bli_packm_blk_var1;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
|
||||
|
||||
@@ -116,8 +116,7 @@ void bli_gemm_int
|
||||
if ( im != BLIS_NAT )
|
||||
{
|
||||
if ( im == BLIS_4M1B )
|
||||
if ( f == bli_gemm_ker_var2sl ||
|
||||
f == bli_gemm_ker_var2rr ) f = bli_gemm4mb_ker_var2;
|
||||
if ( f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
379
frame/3/gemm/bli_gemm_ker_var2.c
Normal file
379
frame/3/gemm/bli_gemm_ker_var2.c
Normal file
@@ -0,0 +1,379 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
|
||||
|
||||
|
||||
void bli_gemm_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// If 1m is being employed on a column- or row-stored matrix with a
|
||||
// real-valued beta, we can use the real domain macro-kernel, which
|
||||
// eliminates a little overhead associated with the 1m virtual
|
||||
// micro-kernel.
|
||||
if ( bli_is_1m_packed( schema_a ) )
|
||||
{
|
||||
bli_l3_ind_recast_1m_params
|
||||
(
|
||||
dt_exec,
|
||||
schema_a,
|
||||
c,
|
||||
m, n, k,
|
||||
pd_a, ps_a,
|
||||
pd_b, ps_b,
|
||||
rs_c, cs_c
|
||||
);
|
||||
}
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t i, j; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd and 1st loops.
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
|
||||
|
||||
@@ -60,8 +60,7 @@ GENPROT( gemm_packb )
|
||||
|
||||
GENPROT( gemm_ker_var1 )
|
||||
|
||||
GENPROT( gemm_ker_var2sl )
|
||||
GENPROT( gemm_ker_var2rr )
|
||||
GENPROT( gemm_ker_var2 )
|
||||
|
||||
// Headers for induced algorithms:
|
||||
GENPROT( gemm4mb_ker_var2 ) // 4m1b
|
||||
@@ -93,8 +92,7 @@ void PASTEMAC(ch,varname) \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )
|
||||
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
|
||||
|
||||
// Headers for induced algorithms:
|
||||
INSERT_GENTPROT_BASIC0( gemm4mb_ker_var2 ) // 4m1b
|
||||
|
||||
555
frame/3/herk/bli_herk_l_ker_var2.c
Normal file
555
frame/3/herk/bli_herk_l_ker_var2.c
Normal file
@@ -0,0 +1,555 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T herk_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffc,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
|
||||
|
||||
|
||||
void bli_herk_l_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffc,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffc, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t i, j, ip; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of C is entirely above the diagonal,
|
||||
it is not stored. So we do nothing. */ \
|
||||
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of C intersects
|
||||
the left edge of the panel, adjust the pointer to C and A and treat
|
||||
this case as if the diagonal offset were zero. */ \
|
||||
if ( diagoffc < 0 ) \
|
||||
{ \
|
||||
ip = -diagoffc / MR; \
|
||||
i = ip * MR; \
|
||||
m = m - i; \
|
||||
diagoffc = -diagoffc % MR; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
a_cast = a_cast + (ip )*ps_a; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region to the right of where the diagonal
|
||||
of C intersects the bottom of the panel, shrink it to prevent
|
||||
"no-op" iterations from executing. */ \
|
||||
if ( diagoffc + m < n ) \
|
||||
{ \
|
||||
n = diagoffc + m; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Note that we partition the 2nd loop into two regions: the rectangular
|
||||
part of C, and the triangular portion. */ \
|
||||
dim_t n_iter_rct; \
|
||||
dim_t n_iter_tri; \
|
||||
\
|
||||
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
|
||||
{ \
|
||||
/* If the entire panel of C does not intersect the diagonal, there is
|
||||
no triangular region, and therefore we can skip the second set of
|
||||
loops. */ \
|
||||
n_iter_rct = n_iter; \
|
||||
n_iter_tri = 0; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the panel of C does intersect the diagonal, compute the number of
|
||||
iterations in the rectangular region by dividing NR into the diagonal
|
||||
offset. Any remainder from this integer division is discarded, which
|
||||
is what we want. That is, we want the rectangular region to contain
|
||||
as many columns of whole microtiles as possible without including any
|
||||
microtiles that intersect the diagonal. The number of iterations in
|
||||
the triangular (or trapezoidal) region is computed as the remaining
|
||||
number of iterations in the n dimension. */ \
|
||||
n_iter_rct = diagoffc / NR; \
|
||||
n_iter_tri = n_iter - n_iter_rct; \
|
||||
} \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd and 1st loops for
|
||||
the initial rectangular region of C (if it exists).
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* No need to compute the diagonal offset for the rectangular
|
||||
region. */ \
|
||||
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly below the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly above the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* If there is no triangular region, then we're done. */ \
|
||||
if ( n_iter_tri == 0 ) return; \
|
||||
\
|
||||
/* Use round-robin assignment of micropanels to threads in the 2nd loop
|
||||
and the default (slab or rr) partitioning in the 1st loop for the
|
||||
remaining triangular region of C. */ \
|
||||
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
\
|
||||
/* Advance the start and end iteration offsets for the triangular region
|
||||
by the number of iterations used for the rectangular region. */ \
|
||||
jr_start += n_iter_rct; \
|
||||
jr_end += n_iter_rct; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly below the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly above the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
|
||||
m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
|
||||
|
||||
558
frame/3/herk/bli_herk_u_ker_var2.c
Normal file
558
frame/3/herk/bli_herk_u_ker_var2.c
Normal file
@@ -0,0 +1,558 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T herk_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffc,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
|
||||
|
||||
|
||||
void bli_herk_u_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffc,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffc, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t i, j, jp; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of C is entirely below the diagonal,
|
||||
it is not stored. So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of C
|
||||
intersects the top edge of the panel, adjust the pointer to C and B
|
||||
and treat this case as if the diagonal offset were zero.
|
||||
NOTE: It's possible that after this pruning that the diagonal offset
|
||||
is still positive (though it is guaranteed to be less than NR). */ \
|
||||
if ( diagoffc > 0 ) \
|
||||
{ \
|
||||
jp = diagoffc / NR; \
|
||||
j = jp * NR; \
|
||||
n = n - j; \
|
||||
diagoffc = diagoffc % NR; \
|
||||
c_cast = c_cast + (j )*cs_c; \
|
||||
b_cast = b_cast + (jp )*ps_b; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of C intersects
|
||||
the right edge of the panel, shrink it to prevent "no-op" iterations
|
||||
from executing. */ \
|
||||
if ( -diagoffc + n < m ) \
|
||||
{ \
|
||||
m = -diagoffc + n; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Note that we partition the 2nd loop into two regions: the triangular
|
||||
part of C, and the rectangular portion. */ \
|
||||
dim_t n_iter_tri; \
|
||||
dim_t n_iter_rct; \
|
||||
\
|
||||
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
|
||||
{ \
|
||||
/* If the entire panel of C does not intersect the diagonal, there is
|
||||
no triangular region, and therefore we can skip the first set of
|
||||
loops. */ \
|
||||
n_iter_tri = 0; \
|
||||
n_iter_rct = n_iter; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the panel of C does intersect the diagonal, compute the number of
|
||||
iterations in the triangular (or trapezoidal) region by dividing NR
|
||||
into the number of rows in C. A non-zero remainder means we need to
|
||||
add one additional iteration. That is, we want the triangular region
|
||||
to contain as few columns of whole microtiles as possible while still
|
||||
including all microtiles that intersect the diagonal. The number of
|
||||
iterations in the rectangular region is computed as the remaining
|
||||
number of iterations in the n dimension. */ \
|
||||
n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
|
||||
n_iter_rct = n_iter - n_iter_tri; \
|
||||
} \
|
||||
\
|
||||
/* Use round-robin assignment of micropanels to threads in the 2nd loop
|
||||
and the default (slab or rr) partitioning in the 1st loop for the
|
||||
initial triangular region of C (if it exists). */ \
|
||||
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly above the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly below the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
|
||||
m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* If there is no rectangular region, then we're done. */ \
|
||||
if ( n_iter_rct == 0 ) return; \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd loop of the
|
||||
remaining rectangular region of C (and also use default partitioning
|
||||
for the 1st loop).
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
\
|
||||
/* Advance the start and end iteration offsets for the rectangular region
|
||||
by the number of iterations used for the triangular region. */ \
|
||||
jr_start += n_iter_tri; \
|
||||
jr_end += n_iter_tri; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* No need to compute the diagonal offset for the rectangular
|
||||
region. */ \
|
||||
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly above the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly below the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
|
||||
|
||||
@@ -56,13 +56,10 @@ void PASTEMAC0(opname) \
|
||||
//GENPROT( herk_blk_var2 )
|
||||
//GENPROT( herk_blk_var3 )
|
||||
|
||||
GENPROT( herk_x_ker_var2sl )
|
||||
GENPROT( herk_x_ker_var2rr )
|
||||
GENPROT( herk_x_ker_var2 )
|
||||
|
||||
GENPROT( herk_l_ker_var2sl )
|
||||
GENPROT( herk_l_ker_var2rr )
|
||||
GENPROT( herk_u_ker_var2sl )
|
||||
GENPROT( herk_u_ker_var2rr )
|
||||
GENPROT( herk_l_ker_var2 )
|
||||
GENPROT( herk_u_ker_var2 )
|
||||
//GENPROT( herk_packa )
|
||||
//GENPROT( herk_packb )
|
||||
|
||||
@@ -94,8 +91,6 @@ void PASTEMAC(ch,varname) \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( herk_l_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( herk_l_ker_var2rr )
|
||||
INSERT_GENTPROT_BASIC0( herk_u_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( herk_u_ker_var2rr )
|
||||
INSERT_GENTPROT_BASIC0( herk_l_ker_var2 )
|
||||
INSERT_GENTPROT_BASIC0( herk_u_ker_var2 )
|
||||
|
||||
|
||||
@@ -35,12 +35,12 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
static gemm_var_oft vars_sl[2] =
|
||||
static gemm_var_oft vars[2] =
|
||||
{
|
||||
bli_herk_l_ker_var2sl, bli_herk_u_ker_var2sl,
|
||||
bli_herk_l_ker_var2, bli_herk_u_ker_var2,
|
||||
};
|
||||
|
||||
void bli_herk_x_ker_var2sl
|
||||
void bli_herk_x_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* ah,
|
||||
@@ -59,48 +59,7 @@ void bli_herk_x_ker_var2sl
|
||||
else uplo = 1;
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars_sl[uplo];
|
||||
|
||||
// Call the macrokernel.
|
||||
f
|
||||
(
|
||||
a,
|
||||
ah,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
static gemm_var_oft vars_rr[2] =
|
||||
{
|
||||
bli_herk_l_ker_var2rr, bli_herk_u_ker_var2rr,
|
||||
};
|
||||
|
||||
void bli_herk_x_ker_var2rr
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
bool_t uplo;
|
||||
gemm_var_oft f;
|
||||
|
||||
// Set a bool based on the uplo field of C's root object.
|
||||
if ( bli_obj_root_is_lower( c ) ) uplo = 0;
|
||||
else uplo = 1;
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars_rr[uplo];
|
||||
f = vars[uplo];
|
||||
|
||||
// Call the macrokernel.
|
||||
f
|
||||
|
||||
533
frame/3/trmm/bli_trmm_ll_ker_var2.c
Normal file
533
frame/3/trmm/bli_trmm_ll_ker_var2.c
Normal file
@@ -0,0 +1,533 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffa,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
|
||||
|
||||
|
||||
void bli_trmm_ll_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffa = bli_obj_diag_offset( a );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffa, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_a1011; \
|
||||
dim_t off_a1011; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_a_num; \
|
||||
inc_t ss_a_den; \
|
||||
inc_t ps_a_cur; \
|
||||
inc_t is_a_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current block of A is entirely above the diagonal,
|
||||
it is implicitly zero. So we do nothing. */ \
|
||||
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* Compute k_full. For all trmm, k_full is simply k. This is
|
||||
needed because some parameter combinations of trmm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of B (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = k; \
|
||||
\
|
||||
/* Compute indexing scaling factor for for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_a ) || \
|
||||
bli_is_3mi_packed( schema_a ) || \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of A intersects the
|
||||
left edge of the block, adjust the pointer to C and treat this case as
|
||||
if the diagonal offset were zero. This skips over the region that was
|
||||
not packed. (Note we assume the diagonal offset is a multiple of MR;
|
||||
this assumption will hold as long as the cache blocksizes are each a
|
||||
multiple of MR and NR.) */ \
|
||||
if ( diagoffa < 0 ) \
|
||||
{ \
|
||||
i = -diagoffa; \
|
||||
m = m - i; \
|
||||
diagoffa = 0; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k; \
|
||||
istep_b = PACKNR * k_full; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
|
||||
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
/*dim_t ir_start, ir_end;*/ \
|
||||
dim_t jr_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd loop.
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time. \
|
||||
NOTE: Parallelism in the 1st loop is disabled for now. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, scale C
|
||||
by beta. If it is strictly below the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict b1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Determine the offset to and length of the panel that was
|
||||
packed so we can index into the corresponding location in
|
||||
b1. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bli_min( diagoffa_i + MR, k ); \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_a_cur = k_a1011 * PACKMR; \
|
||||
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
/* NOTE: ir loop parallelism disabled for now. */ \
|
||||
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
|
||||
\
|
||||
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
/*}*/ \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* NOTE: ir loop parallelism disabled for now. */ \
|
||||
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
/*}*/ \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )
|
||||
|
||||
541
frame/3/trmm/bli_trmm_lu_ker_var2.c
Normal file
541
frame/3/trmm/bli_trmm_lu_ker_var2.c
Normal file
@@ -0,0 +1,541 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffa,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
|
||||
|
||||
|
||||
void bli_trmm_lu_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffa = bli_obj_diag_offset( a );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffa, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_a1112; \
|
||||
dim_t off_a1112; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_a_num; \
|
||||
inc_t ss_a_den; \
|
||||
inc_t ps_a_cur; \
|
||||
inc_t is_a_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current block of A is entirely below the diagonal,
|
||||
it is implicitly zero. So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* Compute k_full. For all trmm, k_full is simply k. This is
|
||||
needed because some parameter combinations of trmm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of B (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = k; \
|
||||
\
|
||||
/* Compute indexing scaling factor for for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_a ) || \
|
||||
bli_is_3mi_packed( schema_a ) || \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of A
|
||||
intersects the top edge of the block, adjust the pointer to B and
|
||||
treat this case as if the diagonal offset were zero. Note that we
|
||||
don't need to adjust the pointer to A since packm would have simply
|
||||
skipped over the region that was not stored. */ \
|
||||
if ( diagoffa > 0 ) \
|
||||
{ \
|
||||
i = diagoffa; \
|
||||
k = k - i; \
|
||||
diagoffa = 0; \
|
||||
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of A intersects the
|
||||
right side of the block, shrink it to prevent "no-op" iterations from
|
||||
executing. */ \
|
||||
if ( -diagoffa + k < m ) \
|
||||
{ \
|
||||
m = -diagoffa + k; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k; \
|
||||
istep_b = PACKNR * k_full; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
|
||||
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
/*dim_t ir_start, ir_end;*/ \
|
||||
dim_t jr_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd loop.
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time. \
|
||||
NOTE: Parallelism in the 1st loop is disabled for now. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, scale C
|
||||
by beta. If it is strictly above the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict b1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Determine the offset to and length of the panel that was
|
||||
packed so we can index into the corresponding location in
|
||||
b1. */ \
|
||||
off_a1112 = diagoffa_i; \
|
||||
k_a1112 = k - off_a1112; \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_a_cur = k_a1112 * PACKMR; \
|
||||
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
/* NOTE: ir loop parallelism disabled for now. */ \
|
||||
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
|
||||
\
|
||||
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_a1112, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_a1112, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
/*}*/ \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
/* NOTE: ir loop parallelism disabled for now. */ \
|
||||
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
/*}*/ \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )
|
||||
|
||||
598
frame/3/trmm/bli_trmm_rl_ker_var2.c
Normal file
598
frame/3/trmm/bli_trmm_rl_ker_var2.c
Normal file
@@ -0,0 +1,598 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffb,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
|
||||
|
||||
|
||||
void bli_trmm_rl_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffb = bli_obj_diag_offset( b );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffb,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
/* Type-instantiating macro for the trmm_rl macrokernel: B, the triangular
   operand, is applied from the right, and panels of B strictly above its
   diagonal are implicitly zero (see the safeguard below). The 2nd (jr)
   loop runs in two phases: first the rectangular region of B, partitioned
   across threads by bli_thread_range_jrir() (slab or round-robin, per
   configure-time choice), then the diagonal-intersecting region, which is
   always assigned round-robin via bli_trmm_my_iter_rr(). */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffb, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	const dim_t     PACKMR     = cs_a; \
	const dim_t     PACKNR     = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t          diagoffb_j; \
	dim_t           k_full; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	dim_t           k_b1121; \
	dim_t           off_b1121; \
	dim_t           i, j; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	inc_t           istep_a; \
	inc_t           istep_b; \
	inc_t           off_scl; \
	inc_t           ss_b_num; \
	inc_t           ss_b_den; \
	inc_t           ps_b_cur; \
	inc_t           is_b_cur; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of B is entirely above the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of A (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_b ) || \
	     bli_is_3mi_packed( schema_b ) || \
	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
	else                                      { ss_b_num = 1; ss_b_den = 1; } \
\
	/* If there is a zero region above where the diagonal of B intersects
	   the left edge of the panel, adjust the pointer to A and treat this
	   case as if the diagonal offset were zero. Note that we don't need to
	   adjust the pointer to B since packm would have simply skipped over
	   the region that was not stored. */ \
	if ( diagoffb < 0 ) \
	{ \
		j        = -diagoffb; \
		k        = k - j; \
		diagoffb = 0; \
		a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
	} \
\
	/* If there is a zero region to the right of where the diagonal
	   of B intersects the bottom of the panel, shrink it to prevent
	   "no-op" iterations from executing. */ \
	if ( diagoffb + k < n ) \
	{ \
		n = diagoffb + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k_full; \
	istep_b = PACKNR * k; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( istep_a, &aux ); \
\
	/* Query the thrinfo_t node for the 1st (ir) loop around the
	   micro-kernel. */ \
	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
	/* Query the number of ways of parallelism and the work ids for the
	   jr and ir loops. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
	dim_t ir_nt  = bli_thread_n_way( caucus ); \
	dim_t ir_tid = bli_thread_work_id( caucus ); \
\
	dim_t jr_start, jr_end; \
	dim_t ir_start, ir_end; \
	dim_t jr_inc,   ir_inc; \
\
	/* Note that we partition the 2nd loop into two regions: the rectangular
	   part of B, and the triangular portion. */ \
	dim_t n_iter_rct; \
	dim_t n_iter_tri; \
\
	if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
	{ \
		/* If the entire panel of B does not intersect the diagonal, there is
		   no triangular region, and therefore we can skip the second set of
		   loops. */ \
		n_iter_rct = n_iter; \
		n_iter_tri = 0; \
	} \
	else \
	{ \
		/* If the panel of B does intersect the diagonal, compute the number of
		   iterations in the rectangular region by dividing NR into the diagonal
		   offset. (There should never be any remainder in this division.) The
		   number of iterations in the triangular (or trapezoidal) region is
		   computed as the remaining number of iterations in the n dimension. */ \
		n_iter_rct = diagoffb / NR; \
		n_iter_tri = n_iter - n_iter_rct; \
	} \
\
	/* Determine the thread range and increment for the 2nd and 1st loops for
	   the initial rectangular region of B (if it exists).
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time. \
	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		{ \
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( istep_b, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = ir_start; i < ir_end; i += ir_inc ) \
			{ \
				ctype* restrict a2; \
\
				a1  = a_cast + i * rstep_a; \
				c11 = c1     + i * rstep_c; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
				{ \
					a2 = a_cast; \
					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct,  rs_ct, cs_ct, \
					                       c11, rs_c,  cs_c ); \
				} \
			} \
		} \
	} \
\
	/* If there is no triangular region, then we're done. */ \
	if ( n_iter_tri == 0 ) return; \
\
	/* Use round-robin assignment of micropanels to threads in the 2nd and
	   1st loops for the remaining triangular region of B (if it exists).
	   NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
	   employ a hack that calls for each thread to execute every iteration
	   of the jr and ir loops but skip all but the pointer increment for
	   iterations that are not assigned to it. */ \
\
	/* Advance the starting b1 and c1 pointers to the positions corresponding
	   to the start of the triangular region of B. */ \
	jr_start = n_iter_rct; \
	b1 = b_cast + jr_start * cstep_b; \
	c1 = c_cast + jr_start * cstep_c; \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < n_iter; ++j ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
		/* Determine the offset to the beginning of the panel that
		   was packed so we can index into the corresponding location
		   in A. Then compute the length of that panel. */ \
		off_b1121 = bli_max( -diagoffb_j, 0 ); \
		k_b1121   = k - off_b1121; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* If the current panel of B intersects the diagonal, scale C
		   by beta. If it is strictly below the diagonal, scale by one.
		   This allows the current macro-kernel to work for both trmm
		   and trmm3. */ \
		{ \
			/* Compute the panel stride for the current diagonal-
			   intersecting micro-panel. */ \
			is_b_cur  = k_b1121 * PACKNR; \
			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
			ps_b_cur  = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
			/* Round-robin: only execute this jr iteration if it is
			   assigned to this thread; other threads still perform the
			   b1/c1 pointer increments below. */ \
			if ( bli_trmm_my_iter_rr( j, thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
\
				ctype* restrict a1_i; \
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c,  cs_c, \
					                        ct,  rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        c11, rs_c,  cs_c ); \
				} \
				} \
\
				a1  += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			b1 += ps_b_cur; \
		} \
\
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )
|
||||
|
||||
618
frame/3/trmm/bli_trmm_ru_ker_var2.c
Normal file
618
frame/3/trmm/bli_trmm_ru_ker_var2.c
Normal file
@@ -0,0 +1,618 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Shorthand alias for the type of the datatype-specific macrokernel
// functions instantiated below via GENTFUNC.
#define FUNCPTR_T gemm_fp

// Signature shared by all type-specific trmm_ru macrokernels: raw packed
// buffers with their packing strides (cs_a/rs_b), micro-panel dimensions
// (pd_a/pd_b) and micro-panel strides (ps_a/ps_b), plus the diagonal
// offset of the triangular matrix B.
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffb,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Array of type-specific function pointers, indexed by datatype; the
// object-based wrapper below dispatches through this table.
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
|
||||
|
||||
|
||||
void bli_trmm_ru_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffb = bli_obj_diag_offset( b );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffb,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffb, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffb_j; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_b0111; \
|
||||
dim_t off_b0111; \
|
||||
dim_t i, j, jb0; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_b_num; \
|
||||
inc_t ss_b_den; \
|
||||
inc_t ps_b_cur; \
|
||||
inc_t is_b_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of B is entirely below its diagonal,
|
||||
it is implicitly zero. So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
|
||||
\
|
||||
/* Compute k_full. For all trmm, k_full is simply k. This is
|
||||
needed because some parameter combinations of trmm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of A (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = k; \
|
||||
\
|
||||
/* Compute indexing scaling factor for for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_b ) || \
|
||||
bli_is_3mi_packed( schema_b ) || \
|
||||
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of B
|
||||
intersects the top edge of the panel, adjust the pointer to C and
|
||||
treat this case as if the diagonal offset were zero. This skips over
|
||||
the region that was not packed. (Note we assume the diagonal offset
|
||||
is a multiple of MR; this assumption will hold as long as the cache
|
||||
blocksizes are each a multiple of MR and NR.) */ \
|
||||
if ( diagoffb > 0 ) \
|
||||
{ \
|
||||
j = diagoffb; \
|
||||
n = n - j; \
|
||||
diagoffb = 0; \
|
||||
c_cast = c_cast + (j )*cs_c; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of B intersects the
|
||||
right side of the block, shrink it to prevent "no-op" iterations from
|
||||
executing. */ \
|
||||
if ( -diagoffb + n < k ) \
|
||||
{ \
|
||||
k = -diagoffb + n; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k_full; \
|
||||
istep_b = PACKNR * k; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Note that we partition the 2nd loop into two regions: the triangular
|
||||
part of C, and the rectangular portion. */ \
|
||||
dim_t n_iter_tri; \
|
||||
dim_t n_iter_rct; \
|
||||
\
|
||||
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
|
||||
{ \
|
||||
/* If the entire panel of B does not intersect the diagonal, there is
|
||||
no triangular region, and therefore we can skip the first set of
|
||||
loops. */ \
|
||||
n_iter_tri = 0; \
|
||||
n_iter_rct = n_iter; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the panel of B does intersect the diagonal, compute the number of
|
||||
iterations in the triangular (or trapezoidal) region by dividing NR
|
||||
into the number of rows in B. (There should never be any remainder
|
||||
in this division.) The number of iterations in the rectangular region
|
||||
is computed as the remaining number of iterations in the n dimension. */ \
|
||||
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
|
||||
n_iter_rct = n_iter - n_iter_tri; \
|
||||
} \
|
||||
\
|
||||
/* Use round-robin assignment of micropanels to threads in the 2nd and
|
||||
1st loops for the initial triangular region of B (if it exists).
|
||||
NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
|
||||
employ a hack that calls for each thread to execute every iteration
|
||||
of the jr and ir loops but skip all but the pointer increment for
|
||||
iterations that are not assigned to it. */ \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter_tri; ++j ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
diagoffb_j = diagoffb - ( doff_t )j*NR; \
|
||||
\
|
||||
/* Determine the offset to and length of the panel that was packed
|
||||
so we can index into the corresponding location in A. */ \
|
||||
off_b0111 = 0; \
|
||||
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* If the current panel of B intersects the diagonal, scale C
|
||||
by beta. If it is strictly below the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
{ \
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_b_cur = k_b0111 * PACKNR; \
|
||||
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
if ( bli_trmm_my_iter_rr( j, thread ) ) { \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
|
||||
\
|
||||
ctype* restrict a1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_b0111, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_b0111, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += ps_b_cur; \
|
||||
} \
|
||||
\
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/* If there is no rectangular region, then we're done. */ \
|
||||
if ( n_iter_rct == 0 ) return; \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd and 1st loops for
|
||||
the remaining rectangular region of B.
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time. \
|
||||
NOTE: Parallelism in the 1st loop is disabled for now. */ \
|
||||
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Advance the start and end iteration offsets for the rectangular region
|
||||
by the number of iterations used for the triangular region. */ \
|
||||
jr_start += n_iter_tri; \
|
||||
jr_end += n_iter_tri; \
|
||||
jb0 = n_iter_tri; \
|
||||
\
|
||||
/* Save the resulting value of b1 from the previous loop since it represents
|
||||
the starting point for the rectangular region. */ \
|
||||
b_cast = b1; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
/* NOTE: We must index through b_cast differently since it contains
|
||||
the starting address of the rectangular region (which is already
|
||||
n_iter_tri logical iterations through B). */ \
|
||||
b1 = b_cast + (j-jb0) * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* If the current panel of B intersects the diagonal, scale C
|
||||
by beta. If it is strictly below the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
{ \
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )
|
||||
|
||||
@@ -56,17 +56,12 @@ void PASTEMAC0(opname) \
|
||||
//GENPROT( trmm_blk_var2 )
|
||||
//GENPROT( trmm_blk_var3 )
|
||||
|
||||
GENPROT( trmm_xx_ker_var2sl )
|
||||
GENPROT( trmm_xx_ker_var2rr )
|
||||
GENPROT( trmm_xx_ker_var2 )
|
||||
|
||||
GENPROT( trmm_ll_ker_var2sl )
|
||||
GENPROT( trmm_ll_ker_var2rr )
|
||||
GENPROT( trmm_lu_ker_var2sl )
|
||||
GENPROT( trmm_lu_ker_var2rr )
|
||||
GENPROT( trmm_rl_ker_var2sl )
|
||||
GENPROT( trmm_rl_ker_var2rr )
|
||||
GENPROT( trmm_ru_ker_var2sl )
|
||||
GENPROT( trmm_ru_ker_var2rr )
|
||||
GENPROT( trmm_ll_ker_var2 )
|
||||
GENPROT( trmm_lu_ker_var2 )
|
||||
GENPROT( trmm_rl_ker_var2 )
|
||||
GENPROT( trmm_ru_ker_var2 )
|
||||
|
||||
|
||||
//
|
||||
@@ -96,12 +91,8 @@ void PASTEMAC(ch,varname) \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2rr )
|
||||
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2rr )
|
||||
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2rr )
|
||||
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2rr )
|
||||
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 )
|
||||
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 )
|
||||
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 )
|
||||
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 )
|
||||
|
||||
|
||||
@@ -35,13 +35,13 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
static gemm_var_oft vars_sl[2][2] =
|
||||
static gemm_var_oft vars[2][2] =
|
||||
{
|
||||
{ bli_trmm_ll_ker_var2sl, bli_trmm_lu_ker_var2sl },
|
||||
{ bli_trmm_rl_ker_var2sl, bli_trmm_ru_ker_var2sl }
|
||||
{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
|
||||
{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
|
||||
};
|
||||
|
||||
void bli_trmm_xx_ker_var2sl
|
||||
void bli_trmm_xx_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
@@ -73,62 +73,7 @@ void bli_trmm_xx_ker_var2sl
|
||||
}
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars_sl[side][uplo];
|
||||
|
||||
// Call the macrokernel.
|
||||
f
|
||||
(
|
||||
a,
|
||||
b,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
static gemm_var_oft vars_rr[2][2] =
|
||||
{
|
||||
{ bli_trmm_ll_ker_var2rr, bli_trmm_lu_ker_var2rr },
|
||||
{ bli_trmm_rl_ker_var2rr, bli_trmm_ru_ker_var2rr }
|
||||
};
|
||||
|
||||
void bli_trmm_xx_ker_var2rr
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
bool_t side;
|
||||
bool_t uplo;
|
||||
gemm_var_oft f;
|
||||
|
||||
// Set two bools: one based on the implied side parameter (the structure
|
||||
// of the root object) and one based on the uplo field of the triangular
|
||||
// matrix's root object (whether that is matrix A or matrix B).
|
||||
if ( bli_obj_root_is_triangular( a ) )
|
||||
{
|
||||
side = 0;
|
||||
if ( bli_obj_root_is_lower( a ) ) uplo = 0;
|
||||
else uplo = 1;
|
||||
}
|
||||
else // if ( bli_obj_root_is_triangular( b ) )
|
||||
{
|
||||
side = 1;
|
||||
if ( bli_obj_root_is_lower( b ) ) uplo = 0;
|
||||
else uplo = 1;
|
||||
}
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars_rr[side][uplo];
|
||||
f = vars[side][uplo];
|
||||
|
||||
// Call the macrokernel.
|
||||
f
|
||||
|
||||
@@ -58,24 +58,12 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
void* packa_fp;
|
||||
void* packb_fp;
|
||||
|
||||
#ifdef BLIS_ENABLE_JRIR_SLAB
|
||||
|
||||
// Use the function pointer to the macrokernels that use slab
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
macro_kernel_p = bli_trsm_xx_ker_var2sl;
|
||||
macro_kernel_p = bli_trsm_xx_ker_var2;
|
||||
|
||||
packa_fp = bli_packm_blk_var1sl;
|
||||
packb_fp = bli_packm_blk_var1sl;
|
||||
|
||||
#else // BLIS_ENABLE_JRIR_RR
|
||||
|
||||
// Use the function pointer to the macrokernels that use round-robin
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
macro_kernel_p = bli_trsm_xx_ker_var2rr;
|
||||
|
||||
packa_fp = bli_packm_blk_var1rr;
|
||||
packb_fp = bli_packm_blk_var1rr;
|
||||
#endif
|
||||
packa_fp = bli_packm_blk_var1;
|
||||
packb_fp = bli_packm_blk_var1;
|
||||
|
||||
const opid_t family = BLIS_TRSM;
|
||||
|
||||
@@ -162,16 +150,11 @@ cntl_t* bli_trsm_r_cntl_create
|
||||
pack_t schema_b
|
||||
)
|
||||
{
|
||||
// trsm macrokernels are presently disabled for right-side execution,
|
||||
// so it doesn't matter which function pointer we use here (sl or rr).
|
||||
// To be safe, we'll insert an abort() guard to alert the developers
|
||||
// of this should right-side macrokernels ever be re-enabled.
|
||||
void* macro_kernel_p = bli_trsm_xx_ker_var2sl;
|
||||
// NOTE: trsm macrokernels are presently disabled for right-side execution.
|
||||
void* macro_kernel_p = bli_trsm_xx_ker_var2;
|
||||
|
||||
void* packa_fp = bli_packm_blk_var1sl;
|
||||
void* packb_fp = bli_packm_blk_var1sl;
|
||||
|
||||
bli_abort();
|
||||
void* packa_fp = bli_packm_blk_var1;
|
||||
void* packb_fp = bli_packm_blk_var1;
|
||||
|
||||
const opid_t family = BLIS_TRSM;
|
||||
|
||||
|
||||
604
frame/3/trsm/bli_trsm_ll_ker_var2.c
Normal file
604
frame/3/trsm/bli_trsm_ll_ker_var2.c
Normal file
@@ -0,0 +1,604 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffa,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha1,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
|
||||
|
||||
|
||||
void bli_trsm_ll_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffa = bli_obj_diag_offset( a );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_alpha1;
|
||||
void* buf_alpha2;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to B (the non-triangular matrix). This will be the alpha
|
||||
// scalar used in the gemmtrsm subproblems (ie: the scalar that would
|
||||
// be applied to the packed copy of B prior to it being updated by
|
||||
// the trsm subproblem). This scalar may be unit, if for example it
|
||||
// was applied during packing.
|
||||
buf_alpha1 = bli_obj_internal_scalar_buffer( b );
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to C. This will be the "beta" scalar used in the gemm-only
|
||||
// subproblems that correspond to micro-panels that do not intersect
|
||||
// the diagonal. We need this separate scalar because it's possible
|
||||
// that the alpha attached to B was reset, if it was applied during
|
||||
// packing.
|
||||
buf_alpha2 = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha1,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffa, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha1, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
PASTECH(ch,gemmtrsm_ukr_ft) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha1_cast = alpha1; \
|
||||
ctype* restrict alpha2_cast = alpha2; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_a1011; \
|
||||
dim_t k_a10; \
|
||||
dim_t off_a10; \
|
||||
dim_t off_a11; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_a_num; \
|
||||
inc_t ss_a_den; \
|
||||
inc_t ps_a_cur; \
|
||||
inc_t is_a_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
|
||||
So we do nothing. */ \
|
||||
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* Compute k_full as k inflated up to a multiple of MR. This is
|
||||
needed because some parameter combinations of trsm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of B (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
|
||||
\
|
||||
/* Compute indexing scaling factor for for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_a ) || \
|
||||
bli_is_3mi_packed( schema_a ) || \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. Note that real-only, imag-only, and summed-only
|
||||
packing formats are not applicable here since trsm is a two-
|
||||
operand operation only (unlike trmm, which is capable of three-
|
||||
operand). */ \
|
||||
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of A intersects the
|
||||
left edge of the block, adjust the pointer to C and treat this case as
|
||||
if the diagonal offset were zero. This skips over the region that was
|
||||
not packed. (Note we assume the diagonal offset is a multiple of MR;
|
||||
this assumption will hold as long as the cache blocksizes are each a
|
||||
multiple of MR and NR.) */ \
|
||||
if ( diagoffa < 0 ) \
|
||||
{ \
|
||||
i = -diagoffa; \
|
||||
m = m - i; \
|
||||
diagoffa = 0; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
} \
|
||||
\
|
||||
/* Check the k dimension, which needs to be a multiple of MR. If k
|
||||
isn't a multiple of MR, we adjust it higher to satisfy the micro-
|
||||
kernel, which is expecting to perform an MR x MR triangular solve.
|
||||
This adjustment of k is consistent with what happened when A was
|
||||
packed: all of its bottom/right edges were zero-padded, and
|
||||
furthermore, the panel that stores the bottom-right corner of the
|
||||
matrix has its diagonal extended into the zero-padded region (as
|
||||
identity). This allows the trsm of that bottom-right panel to
|
||||
proceed without producing any infs or NaNs that would infect the
|
||||
"good" values of the corresponding block of B. */ \
|
||||
if ( k % MR != 0 ) k += MR - ( k % MR ); \
|
||||
\
|
||||
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
|
||||
know that the underlying buffer was already allocated to have an m
|
||||
dimension that is a multiple of PACKMR, with the region between the
|
||||
last row and the next multiple of MR zero-padded accordingly. */ \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k; \
|
||||
istep_b = PACKNR * k_full; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* We don't bother querying the thrinfo_t node for the 1st loop because
|
||||
we can't parallelize that loop in trsm due to the inter-iteration
|
||||
dependencies that exist. */ \
|
||||
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t jr_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd loop.
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time.
|
||||
NOTE: Parallelism in the 1st loop is unattainable due to the
|
||||
inter-iteration dependencies present in trsm. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1 + (0 )*rstep_c; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, use a
|
||||
special micro-kernel that performs a fused gemm and trsm.
|
||||
If the current panel of A resides below the diagonal, use a
|
||||
a regular gemm micro-kernel. Otherwise, if it is above the
|
||||
diagonal, it was not packed (because it is implicitly zero)
|
||||
and so we do nothing. */ \
|
||||
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict a10; \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict b01; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute various offsets into and lengths of parts of A. */ \
|
||||
off_a10 = 0; \
|
||||
k_a1011 = diagoffa_i + MR; \
|
||||
k_a10 = k_a1011 - MR; \
|
||||
off_a11 = k_a10; \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_a_cur = k_a1011 * PACKMR; \
|
||||
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
/* Compute the addresses of the panel A10 and the triangular
|
||||
block A11. */ \
|
||||
a10 = a1; \
|
||||
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
|
||||
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
|
||||
\
|
||||
/* Compute the addresses of the panel B01 and the block
|
||||
B11. */ \
|
||||
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
|
||||
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + ps_a_cur; \
|
||||
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_a10, \
|
||||
alpha1_cast, \
|
||||
a10, \
|
||||
a11, \
|
||||
b01, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_a10, \
|
||||
alpha1_cast, \
|
||||
a10, \
|
||||
a11, \
|
||||
b01, \
|
||||
b11, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
b1, \
|
||||
alpha2_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
alpha2_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
if ( bli_is_4mi_packed( schema_a ) ){ \
|
||||
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
|
||||
( double* )b, rs_b, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
|
||||
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
|
||||
}else{ \
|
||||
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
|
||||
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
|
||||
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
|
||||
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
|
||||
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
if ( bli_is_4mi_packed( schema_a ) ){ \
|
||||
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
|
||||
( double* )b, rs_b, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
|
||||
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
|
||||
}else{ \
|
||||
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
|
||||
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
|
||||
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
|
||||
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
|
||||
} \
|
||||
|
||||
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
|
||||
( double* )c, 1, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
|
||||
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )
|
||||
|
||||
585
frame/3/trsm/bli_trsm_lu_ker_var2.c
Normal file
585
frame/3/trsm/bli_trsm_lu_ker_var2.c
Normal file
@@ -0,0 +1,585 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffa,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha1,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
|
||||
|
||||
|
||||
void bli_trsm_lu_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffa = bli_obj_diag_offset( a );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_alpha1;
|
||||
void* buf_alpha2;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to B (the non-triangular matrix). This will be the alpha
|
||||
// scalar used in the gemmtrsm subproblems (ie: the scalar that would
|
||||
// be applied to the packed copy of B prior to it being updated by
|
||||
// the trsm subproblem). This scalar may be unit, if for example it
|
||||
// was applied during packing.
|
||||
buf_alpha1 = bli_obj_internal_scalar_buffer( b );
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to C. This will be the "beta" scalar used in the gemm-only
|
||||
// subproblems that correspond to micro-panels that do not intersect
|
||||
// the diagonal. We need this separate scalar because it's possible
|
||||
// that the alpha attached to B was reset, if it was applied during
|
||||
// packing.
|
||||
buf_alpha2 = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha1,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffa, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha1, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
PASTECH(ch,gemmtrsm_ukr_ft) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha1_cast = alpha1; \
|
||||
ctype* restrict alpha2_cast = alpha2; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_a1112; \
|
||||
dim_t k_a11; \
|
||||
dim_t k_a12; \
|
||||
dim_t off_a11; \
|
||||
dim_t off_a12; \
|
||||
dim_t i, j, ib; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_a_num; \
|
||||
inc_t ss_a_den; \
|
||||
inc_t ps_a_cur; \
|
||||
inc_t is_a_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
|
||||
So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* Compute k_full as k inflated up to a multiple of MR. This is
|
||||
needed because some parameter combinations of trsm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of B (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
|
||||
\
|
||||
/* Compute indexing scaling factor for for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_a ) || \
|
||||
bli_is_3mi_packed( schema_a ) || \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. Note that real-only, imag-only, and summed-only
|
||||
packing formats are not applicable here since trsm is a two-
|
||||
operand operation only (unlike trmm, which is capable of three-
|
||||
operand). */ \
|
||||
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of A
|
||||
intersects the top edge of the block, adjust the pointer to B and
|
||||
treat this case as if the diagonal offset were zero. Note that we
|
||||
don't need to adjust the pointer to A since packm would have simply
|
||||
skipped over the region that was not stored. */ \
|
||||
if ( diagoffa > 0 ) \
|
||||
{ \
|
||||
i = diagoffa; \
|
||||
k = k - i; \
|
||||
diagoffa = 0; \
|
||||
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of A intersects the
|
||||
right side of the block, shrink it to prevent "no-op" iterations from
|
||||
executing. */ \
|
||||
if ( -diagoffa + k < m ) \
|
||||
{ \
|
||||
m = -diagoffa + k; \
|
||||
} \
|
||||
\
|
||||
/* Check the k dimension, which needs to be a multiple of MR. If k
|
||||
isn't a multiple of MR, we adjust it higher to satisfy the micro-
|
||||
kernel, which is expecting to perform an MR x MR triangular solve.
|
||||
This adjustment of k is consistent with what happened when A was
|
||||
packed: all of its bottom/right edges were zero-padded, and
|
||||
furthermore, the panel that stores the bottom-right corner of the
|
||||
matrix has its diagonal extended into the zero-padded region (as
|
||||
identity). This allows the trsm of that bottom-right panel to
|
||||
proceed without producing any infs or NaNs that would infect the
|
||||
"good" values of the corresponding block of B. */ \
|
||||
if ( k % MR != 0 ) k += MR - ( k % MR ); \
|
||||
\
|
||||
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
|
||||
know that the underlying buffer was already allocated to have an m
|
||||
dimension that is a multiple of PACKMR, with the region between the
|
||||
last row and the next multiple of MR zero-padded accordingly. */ \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k; \
|
||||
istep_b = PACKNR * k_full; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* We don't bother querying the thrinfo_t node for the 1st loop because
|
||||
we can't parallelize that loop in trsm due to the inter-iteration
|
||||
dependencies that exist. */ \
|
||||
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t jr_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd loop.
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time.
|
||||
NOTE: Parallelism in the 1st loop is unattainable due to the
|
||||
inter-iteration dependencies present in trsm. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1 + (m_iter-1)*rstep_c; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( ib = 0; ib < m_iter; ++ib ) \
|
||||
{ \
|
||||
i = m_iter - 1 - ib; \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, use a
|
||||
special micro-kernel that performs a fused gemm and trsm.
|
||||
If the current panel of A resides above the diagonal, use a
|
||||
a regular gemm micro-kernel. Otherwise, if it is below the
|
||||
diagonal, it was not packed (because it is implicitly zero)
|
||||
and so we do nothing. */ \
|
||||
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict a12; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict b21; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute various offsets into and lengths of parts of A. */ \
|
||||
off_a11 = diagoffa_i; \
|
||||
k_a1112 = k - off_a11;; \
|
||||
k_a11 = MR; \
|
||||
k_a12 = k_a1112 - MR; \
|
||||
off_a12 = off_a11 + k_a11; \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_a_cur = k_a1112 * PACKMR; \
|
||||
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
/* Compute the addresses of the triangular block A11 and the
|
||||
panel A12. */ \
|
||||
a11 = a1; \
|
||||
/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
|
||||
a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
|
||||
\
|
||||
/* Compute the addresses of the panel B01 and the block
|
||||
B11. */ \
|
||||
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
|
||||
b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + ps_a_cur; \
|
||||
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_a12, \
|
||||
alpha1_cast, \
|
||||
a12, \
|
||||
a11, \
|
||||
b21, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_a12, \
|
||||
alpha1_cast, \
|
||||
a12, \
|
||||
a11, \
|
||||
b21, \
|
||||
b11, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
b1, \
|
||||
alpha2_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
alpha2_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 -= rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
|
||||
printf( "m_iter = %lu\n", m_iter ); \
|
||||
printf( "m_cur = %lu\n", m_cur ); \
|
||||
printf( "k = %lu\n", k ); \
|
||||
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
|
||||
printf( "off_a1112 = %lu\n", off_a1112 ); \
|
||||
printf( "k_a1112 = %lu\n", k_a1112 ); \
|
||||
printf( "k_a12 = %lu\n", k_a12 ); \
|
||||
printf( "k_a11 = %lu\n", k_a11 ); \
|
||||
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
|
||||
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )
|
||||
|
||||
@@ -428,7 +428,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( i, thread ) ){ \
|
||||
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
|
||||
\
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict a12; \
|
||||
@@ -514,7 +514,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( i, thread ) ){ \
|
||||
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
|
||||
@@ -421,7 +421,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( i, thread ) ){ \
|
||||
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
|
||||
\
|
||||
ctype* restrict a10; \
|
||||
ctype* restrict a11; \
|
||||
@@ -507,7 +507,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( i, thread ) ){ \
|
||||
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
|
||||
@@ -58,14 +58,10 @@ GENPROT( trsm_blk_var3 )
|
||||
GENPROT( trsm_packa )
|
||||
GENPROT( trsm_packb )
|
||||
|
||||
GENPROT( trsm_xx_ker_var2sl )
|
||||
GENPROT( trsm_xx_ker_var2rr )
|
||||
|
||||
GENPROT( trsm_ll_ker_var2sl )
|
||||
GENPROT( trsm_ll_ker_var2rr )
|
||||
GENPROT( trsm_lu_ker_var2sl )
|
||||
GENPROT( trsm_lu_ker_var2rr )
|
||||
GENPROT( trsm_xx_ker_var2 )
|
||||
|
||||
GENPROT( trsm_ll_ker_var2 )
|
||||
GENPROT( trsm_lu_ker_var2 )
|
||||
GENPROT( trsm_rl_ker_var2 )
|
||||
GENPROT( trsm_ru_ker_var2 )
|
||||
|
||||
@@ -97,11 +93,8 @@ void PASTEMAC(ch,varname) \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2rr )
|
||||
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2rr )
|
||||
|
||||
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 )
|
||||
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 )
|
||||
INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 )
|
||||
INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 )
|
||||
|
||||
|
||||
@@ -35,13 +35,13 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
static trsm_var_oft vars_sl[2][2] =
|
||||
static trsm_var_oft vars[2][2] =
|
||||
{
|
||||
{ bli_trsm_ll_ker_var2sl, bli_trsm_lu_ker_var2sl },
|
||||
{ bli_trsm_rl_ker_var2 , bli_trsm_ru_ker_var2 }
|
||||
{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
|
||||
{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
|
||||
};
|
||||
|
||||
void bli_trsm_xx_ker_var2sl
|
||||
void bli_trsm_xx_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
@@ -73,62 +73,7 @@ void bli_trsm_xx_ker_var2sl
|
||||
}
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars_sl[side][uplo];
|
||||
|
||||
// Call the macrokernel.
|
||||
f
|
||||
(
|
||||
a,
|
||||
b,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
static trsm_var_oft vars_rr[2][2] =
|
||||
{
|
||||
{ bli_trsm_ll_ker_var2rr, bli_trsm_lu_ker_var2rr },
|
||||
{ bli_trsm_rl_ker_var2 , bli_trsm_ru_ker_var2 }
|
||||
};
|
||||
|
||||
void bli_trsm_xx_ker_var2rr
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
bool_t side;
|
||||
bool_t uplo;
|
||||
trsm_var_oft f;
|
||||
|
||||
// Set two bools: one based on the implied side parameter (the structure
|
||||
// of the root object) and one based on the uplo field of the triangular
|
||||
// matrix's root object (whether that is matrix A or matrix B).
|
||||
if ( bli_obj_root_is_triangular( a ) )
|
||||
{
|
||||
side = 0;
|
||||
if ( bli_obj_root_is_lower( a ) ) uplo = 0;
|
||||
else uplo = 1;
|
||||
}
|
||||
else // if ( bli_obj_root_is_triangular( b ) )
|
||||
{
|
||||
side = 1;
|
||||
if ( bli_obj_root_is_lower( b ) ) uplo = 0;
|
||||
else uplo = 1;
|
||||
}
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars_rr[side][uplo];
|
||||
f = vars[side][uplo];
|
||||
|
||||
// Call the macrokernel.
|
||||
f
|
||||
|
||||
@@ -804,6 +804,15 @@ static bool_t bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth
|
||||
( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
|
||||
}
|
||||
|
||||
static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_JRIR_SLAB
|
||||
return bli_is_last_iter_sl( i, end_iter, tid, nth );
|
||||
#else // BLIS_ENABLE_JRIR_RR
|
||||
return bli_is_last_iter_rr( i, end_iter, tid, nth );
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// packbuf_t-related
|
||||
|
||||
|
||||
@@ -59,32 +59,6 @@ void bli_thread_finalize( void )
|
||||
{
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
#if 0
|
||||
void bli_thread_range_jrir
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* start,
|
||||
dim_t* end,
|
||||
dim_t* inc
|
||||
)
|
||||
{
|
||||
//#ifdef BLIS_JRIR_INTERLEAVE
|
||||
#if 1
|
||||
// Use interleaved partitioning of jr/ir loops.
|
||||
*start = bli_thread_work_id( thread );
|
||||
*inc = bli_thread_n_way( thread );
|
||||
*end = n;
|
||||
#else
|
||||
// Use contiguous slab partitioning for jr/ir loops.
|
||||
bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
|
||||
*inc = 1;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_thread_range_sub
|
||||
|
||||
@@ -57,19 +57,6 @@ void bli_thread_finalize( void );
|
||||
#endif
|
||||
|
||||
// Thread range-related prototypes.
|
||||
#if 0
|
||||
void bli_thread_range_jrir
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* start,
|
||||
dim_t* end,
|
||||
dim_t* inc
|
||||
);
|
||||
#endif
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_thread_range_sub
|
||||
(
|
||||
@@ -261,7 +248,6 @@ static void bli_thread_range_jrir_sl
|
||||
*inc = 1;
|
||||
}
|
||||
|
||||
#if 0
|
||||
static void bli_thread_range_jrir
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
@@ -273,6 +259,9 @@ static void bli_thread_range_jrir
|
||||
dim_t* inc
|
||||
)
|
||||
{
|
||||
// Define a general-purpose version of bli_thread_range_jrir() whose
|
||||
// definition depends on whether slab or round-robin partitioning was
|
||||
// requested at configure-time.
|
||||
#ifdef BLIS_ENABLE_JRIR_SLAB
|
||||
bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
|
||||
#else
|
||||
@@ -280,6 +269,7 @@ static void bli_thread_range_jrir
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 0
|
||||
static void bli_thread_range_weighted_jrir
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
|
||||
@@ -59,24 +59,10 @@ cntl_t* blx_gemmbp_cntl_create
|
||||
void* packa_fp;
|
||||
void* packb_fp;
|
||||
|
||||
#ifdef BLIS_ENABLE_JRIR_SLAB
|
||||
macro_kernel_fp = blx_gemm_ker_var2;
|
||||
|
||||
// Use the function pointers to the macrokernels that use slab
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
macro_kernel_fp = blx_gemm_ker_var2sl;
|
||||
|
||||
packa_fp = bli_packm_blk_var1sl;
|
||||
packb_fp = bli_packm_blk_var1sl;
|
||||
|
||||
#else // BLIS_ENABLE_JRIR_RR
|
||||
|
||||
// Use the function pointers to the macrokernels that use round-robin
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
macro_kernel_fp = bli_gemm_ker_var2rr;
|
||||
|
||||
packa_fp = bli_packm_blk_var1rr;
|
||||
packb_fp = bli_packm_blk_var1rr;
|
||||
#endif
|
||||
packa_fp = bli_packm_blk_var1;
|
||||
packb_fp = bli_packm_blk_var1;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node
|
||||
|
||||
375
sandbox/ref99/vars/blx_gemm_ker_var2.c
Normal file
375
sandbox/ref99/vars/blx_gemm_ker_var2.c
Normal file
@@ -0,0 +1,375 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "blix.h"
|
||||
|
||||
// Function pointer type for datatype-specific functions.
|
||||
typedef void (*gemm_fp)
|
||||
(
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Function pointer array for datatype-specific functions.
// NOTE(review): the entries are ordered s, c, d, z — i.e.
// float (0), scomplex (1), double (2), dcomplex (3) — which must match
// the datatype value used to index this array (dt_exec in
// blx_gemm_ker_var2 below); this matches the ordering used by the
// framework's GENARRAY-style tables.
static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
{
	PASTECH2(blx_,s,gemm_ker_var2),
	PASTECH2(blx_,c,gemm_ker_var2),
	PASTECH2(blx_,d,gemm_ker_var2),
	PASTECH2(blx_,z,gemm_ker_var2)
};
|
||||
|
||||
|
||||
void blx_gemm_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
gemm_fp f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
// Type-generating macro for the gemm macrokernel. For each datatype, the
// generated function partitions C into MR x NR micro-tiles, splits the
// jr (n-dim) and ir (m-dim) loop iterations across threads via
// bli_thread_range_jrir(), and invokes the virtual gemm microkernel on
// each tile; edge tiles are computed into a local buffer ct and then
// accumulated into C.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(blx_,ch,varname) \
     ( \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	/*const dim_t PACKMR = cs_a;*/ \
	/*const dim_t PACKNR = rs_b;*/ \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           i, j; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	/* A nonzero remainder requires one extra (edge-case) iteration. */ \
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( is_a, &aux ); \
	bli_auxinfo_set_is_b( is_b, &aux ); \
\
	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
	   loop around the microkernel. Here we query the thrinfo_t node for the
	   1st (ir) loop around the microkernel. */ \
	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
	dim_t ir_nt  = bli_thread_n_way( caucus ); \
	dim_t ir_tid = bli_thread_work_id( caucus ); \
\
	dim_t jr_start, jr_end; \
	dim_t ir_start, ir_end; \
	dim_t jr_inc,   ir_inc; \
\
	/* Determine the thread range and increment for the 2nd and 1st loops.
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time. */ \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = ir_start; i < ir_end; i += ir_inc ) \
		{ \
			ctype* restrict a2; \
\
			a1  = a_cast + i * rstep_a; \
			c11 = c1     + i * rstep_c; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* Compute the addresses of the next panels of A and B. */ \
			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
			{ \
				a2 = a_cast; \
				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
					b2 = b_cast; \
			} \
\
			/* Save addresses of next panels of A and B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_next_a( a2, &aux ); \
			bli_auxinfo_set_next_b( b2, &aux ); \
\
			/* Handle interior and edge cases separately. */ \
			if ( m_cur == MR && n_cur == NR ) \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  beta_cast, \
				  c11, rs_c, cs_c, \
				  &aux, \
				  cntx \
				); \
			} \
			else \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  zero, \
				  ct, rs_ct, cs_ct, \
				  &aux, \
				  cntx \
				); \
\
				/* Scale the bottom edge of C and add the result from above. */ \
				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
				                        ct,  rs_ct, cs_ct, \
				                        beta_cast, \
				                        c11, rs_c,  cs_c ); \
			} \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
|
||||
|
||||
// Instantiate the macrokernel for all four fundamental datatypes. The
// disabled branch spells out the individual instantiations for
// reference; the enabled branch generates the same set via the
// framework's standard insertion macro.
#if 0
GENTFUNC( float,    s, gemm_ker_var2 )
GENTFUNC( double,   d, gemm_ker_var2 )
GENTFUNC( scomplex, c, gemm_ker_var2 )
GENTFUNC( dcomplex, z, gemm_ker_var2 )
#else
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
#endif
|
||||
|
||||
@@ -58,8 +58,7 @@ GENPROT( gemm_blk_var3 )
|
||||
GENPROT( gemm_packa )
|
||||
GENPROT( gemm_packb )
|
||||
|
||||
GENPROT( gemm_ker_var2sl )
|
||||
GENPROT( gemm_ker_var2rr )
|
||||
GENPROT( gemm_ker_var2 )
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with void pointer operands.
|
||||
@@ -87,6 +86,5 @@ void PASTECH2(blx_,ch,varname) \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )
|
||||
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
|
||||
|
||||
|
||||
Reference in New Issue
Block a user