Consolidated slab/rr-explicit level-3 macrokernels.

Details:
- Consolidated the *sl.c and *rr.c level-3 macrokernels into a single
  file per sl/rr pair, with those files named as they were before
  c92762e. The consolidation does not take away the *option* of using
  slab or round-robin assignment of micropanels to threads; it merely
  *hides* the choice within the definitions of functions such as
  bli_thread_range_jrir(), bli_packm_my_iter(), and bli_is_last_iter()
  rather than expose that choice explicitly in the code. The choice of
  slab or rr is not always hidden, however; there are some cases
  involving herk and trmm, for example, that require some part of the
  computation to use rr unconditionally. (The --thread-part-jrir option
  controls the partitioning in all other cases.)
- Note: Originally, the sl and rr macrokernels were separated out for
  clarity. However, aside from the additional binary code bloat, I later
  deemed that clarity not worth the price of maintaining the additional
  (mostly similar) codes.
This commit is contained in:
Field G. Van Zee
2018-10-17 14:11:01 -05:00
parent 57eab3a4f0
commit 71c5832d5f
53 changed files with 6127 additions and 1807 deletions

View File

@@ -35,6 +35,72 @@
#include "blis.h"
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
void bli_packm_blk_var1
(
obj_t* c,
@@ -44,14 +110,632 @@ void bli_packm_blk_var1
thrinfo_t* t
)
{
#ifdef BLIS_ENABLE_JRIR_SLAB
num_t dt_cp = bli_obj_dt( c );
bli_packm_blk_var1sl( c, p, cntx, cntl, t );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
pack_t schema = bli_obj_pack_schema( p );
bool_t invdiag = bli_obj_has_inverted_diag( p );
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
#else // BLIS_ENABLE_JRIR_RR
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
bli_packm_blk_var1rr( c, p, cntx, cntl, t );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t pd_p = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
obj_t kappa;
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
// Treatment of kappa (ie: packing during scaling) depends on
// whether we are executing an induced method.
if ( bli_is_nat_packed( schema ) )
{
// This branch is for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
}
else // if ( bli_is_ind_packed( schema ) )
{
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
//printf( "applying non-zero imag kappa\n" );
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = κ
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
}
// Choose the correct func_t object based on the pack_t schema.
#if 0
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
#else
// The original idea here was to read the packm_ukr from the context
// if it is non-NULL. The problem is, it requires that we be able to
// assume that the packm_ukr field is initialized to NULL, which it
// currently is not.
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
{
// If the packm structure-aware kernel func_t in the context is
// NULL (which is the default value after the context is created),
// we use the default lookup table to determine the right func_t
// for the current schema.
const dim_t i = bli_pack_schema_index( schema );
packm_kers = &packm_struc_cxk_kers[ i ];
}
#if 0
else // cntx's packm func_t overrides
{
// If the packm structure-aware kernel func_t in the context is
// non-NULL (ie: assumed to be valid), we use that instead.
//packm_kers = bli_cntx_packm_ukrs( cntx );
packm_kers = cntx_packm_kers;
}
#endif
#endif
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
schema,
invdiag,
revifup,
reviflo,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
is_p,
pd_p, ps_p,
packm_ker,
cntx,
t );
}
// Define the datatype-specific packm blocked variant 1 via a type-generic
// macro template. Each instantiation iterates over the logical micropanels
// of the source matrix, handling three cases per panel: (1) unstored
// triangular panels (skipped), (2) diagonal-intersecting triangular panels
// (packed with round-robin thread assignment unconditionally), and
// (3) Hermitian/symmetric or general panels (packed with the slab or
// round-robin assignment selected at configure-time).
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (n_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = n_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
will depend on whether slab or round-robin partitioning was requested
at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes were somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
/* NOTE: We MUST use round-robin partitioning when packing
micropanels of a triangular matrix. Hermitian/symmetric
and general packing may use slab or round-robin, depending
on which was selected at configure-time. */ \
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
/* The definition of bli_packm_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
/* The definition of bli_packm_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
p_begin += p_inc; \
\
} \
}
// Instantiate the variant above for each real/complex datatype pair
// (s, c, d, z), producing bli_spackm_blk_var1, bli_cpackm_blk_var1, etc.
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -1,737 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1rr);
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
// bli_packm_blk_var1rr(): object-based wrapper for the round-robin-specific
// instance of packm blocked variant 1 (removed by this commit in favor of
// the consolidated bli_packm_blk_var1). Extracts all properties from the
// source object c and packed object p, resolves kappa, selects the
// structure-aware packm kernel for the schema, and dispatches to the
// datatype-specific variant.
void bli_packm_blk_var1rr
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* t
)
{
num_t dt_cp = bli_obj_dt( c );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
pack_t schema = bli_obj_pack_schema( p );
bool_t invdiag = bli_obj_has_inverted_diag( p );
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t pd_p = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
obj_t kappa;
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
// Treatment of kappa (ie: packing during scaling) depends on
// whether we are executing an induced method.
if ( bli_is_nat_packed( schema ) )
{
// This branch is for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
}
else // if ( bli_is_ind_packed( schema ) )
{
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
//printf( "applying non-zero imag kappa\n" );
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
}
// Choose the correct func_t object based on the pack_t schema.
#if 0
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
#else
// The original idea here was to read the packm_ukr from the context
// if it is non-NULL. The problem is, it requires that we be able to
// assume that the packm_ukr field is initialized to NULL, which it
// currently is not.
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
{
// If the packm structure-aware kernel func_t in the context is
// NULL (which is the default value after the context is created),
// we use the default lookup table to determine the right func_t
// for the current schema.
const dim_t i = bli_pack_schema_index( schema );
packm_kers = &packm_struc_cxk_kers[ i ];
}
#if 0
else // cntx's packm func_t overrides
{
// If the packm structure-aware kernel func_t in the context is
// non-NULL (ie: assumed to be valid), we use that instead.
//packm_kers = bli_cntx_packm_ukrs( cntx );
packm_kers = cntx_packm_kers;
}
#endif
#endif
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
schema,
invdiag,
revifup,
reviflo,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
is_p,
pd_p, ps_p,
packm_ker,
cntx,
t );
}
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to incidate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (n_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = n_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
/*
printf( "thread %d: packing micropanel iteration %3d\n", (int)tid, (int)it ); \
*/ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
p_begin += p_inc; \
\
} \
/*
printf( "thread %d: done\n", (int)tid ); \
*/ \
}
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1rr )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -1,737 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T packm_fp
// Signature of the type-specific (float/scomplex/double/dcomplex) blocked
// packm variant that the object-based wrapper below dispatches to.
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
// Table of the four datatype instantiations of packm_blk_var1sl, indexed
// by the num_t of the object being packed.
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1sl);
// Default lookup table mapping each pack schema (see bli_pack_schema_index())
// to the structure-aware packing kernel func_t for that schema. NULL entries
// mean the schema is not supported for that datatype (e.g. induced-method
// schemas have no real-domain kernels).
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
// bli_packm_blk_var1sl: object-based entry point for the blocked packm
// variant that uses slab ("sl") assignment of micropanels to threads.
// Unpacks the fields of objects c (source) and p (packed destination),
// chooses the kappa scalar and the structure-aware packing kernel for the
// pack schema of p, and dispatches to the datatype-specific implementation
// selected from the ftypes table.
void bli_packm_blk_var1sl
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* t
)
{
num_t dt_cp = bli_obj_dt( c );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
pack_t schema = bli_obj_pack_schema( p );
bool_t invdiag = bli_obj_has_inverted_diag( p );
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t pd_p = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
obj_t kappa;
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
// Treatment of kappa (ie: packing during scaling) depends on
// whether we are executing an induced method.
if ( bli_is_nat_packed( schema ) )
{
// This branch is for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
}
else // if ( bli_is_ind_packed( schema ) )
{
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
//printf( "applying non-zero imag kappa\n" );
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
}
// Choose the correct func_t object based on the pack_t schema.
// NOTE: The #if 0 branches below preserve an earlier/alternative
// design (schema-family tables, and a cntx-supplied packm func_t
// override) that is currently disabled; only the lookup-table path
// between them is live.
#if 0
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
#else
// The original idea here was to read the packm_ukr from the context
// if it is non-NULL. The problem is, it requires that we be able to
// assume that the packm_ukr field is initialized to NULL, which it
// currently is not.
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
{
// If the packm structure-aware kernel func_t in the context is
// NULL (which is the default value after the context is created),
// we use the default lookup table to determine the right func_t
// for the current schema.
const dim_t i = bli_pack_schema_index( schema );
packm_kers = &packm_struc_cxk_kers[ i ];
}
#if 0
else // cntx's packm func_t overrides
{
// If the packm structure-aware kernel func_t in the context is
// non-NULL (ie: assumed to be valid), we use that instead.
//packm_kers = bli_cntx_packm_ukrs( cntx );
packm_kers = cntx_packm_kers;
}
#endif
#endif
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
schema,
invdiag,
revifup,
reviflo,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
is_p,
pd_p, ps_p,
packm_ker,
cntx,
t );
}
// Macro template for the type-specific blocked packm variant with slab
// partitioning. For each datatype ch, this defines PASTEMAC(ch,varname)
// (ie: ?packm_blk_var1sl), which walks the logical micropanels of the
// source matrix C and packs each panel assigned to the current thread into
// the packed buffer P via the structure-aware kernel packm_ker.
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to incidate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (n_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = n_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: This is the "sl" variant, so slab
partitioning is requested here. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes was
somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
/* NOTE(review): this slab ("sl") variant tests iteration
membership with the _rr predicate in this (diagonal-
intersecting) case, while the other cases below use _sl.
This looks deliberate -- round-robin for the variable-length
triangular panels, per the consolidation notes -- but
confirm against the consolidated bli_packm_blk_var1(). */ \
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
{ \
/*
printf( "thread %d: packing micropanel iteration %3d\n", (int)tid, (int)it ); \
*/ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
p_begin += p_inc; \
\
} \
/*
printf( "thread %d: done\n", (int)tid ); \
*/ \
}
// Instantiate the macro above for all four real/complex datatypes.
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1sl )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -51,7 +51,18 @@
\
( start <= i && i < end )
// Define a general-purpose version of bli_packm_my_iter() whose definition
// depends on whether slab or round-robin partitioning was requested at
// configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
#define bli_packm_my_iter bli_packm_my_iter_sl
#else // BLIS_ENABLE_JRIR_RR
#define bli_packm_my_iter bli_packm_my_iter_rr
#endif
//

View File

@@ -49,10 +49,8 @@ void PASTEMAC0(opname) \
thrinfo_t* t \
);
GENPROT( packm_unb_var1 )
GENPROT( packm_blk_var1 )
GENPROT( packm_blk_var1sl )
GENPROT( packm_blk_var1rr )
GENPROT( packm_unb_var1 )
GENPROT( packm_blk_var1 )
//
// Prototype BLAS-like interfaces with void pointer operands.
@@ -108,6 +106,5 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( packm_blk_var1sl )
INSERT_GENTPROT_BASIC0( packm_blk_var1rr )
INSERT_GENTPROT_BASIC0( packm_blk_var1 )

View File

@@ -39,26 +39,32 @@
// gemm
// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
// herk
// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
// trmm
// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
#define bli_trmm_my_iter( index, thread ) \
#define bli_trmm_my_iter_rr( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )
// trsm
#define bli_trsm_my_iter( index, thread ) \
#define bli_trsm_my_iter_rr( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )

View File

@@ -58,30 +58,15 @@ cntl_t* bli_gemmbp_cntl_create
void* packa_fp;
void* packb_fp;
#ifdef BLIS_ENABLE_JRIR_SLAB
// Use the function pointers to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2sl;
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2sl;
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2sl;
else macro_kernel_fp = NULL;
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2;
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2;
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
else /* should never execute */ macro_kernel_fp = NULL;
packa_fp = bli_packm_blk_var1sl;
packb_fp = bli_packm_blk_var1sl;
#else // BLIS_ENABLE_JRIR_RR
// Use the function pointers to the macrokernels that use round-robin
// assignment of micropanels to threads in the jr and ir loops.
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2rr;
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2rr;
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2rr;
else macro_kernel_fp = NULL;
packa_fp = bli_packm_blk_var1rr;
packb_fp = bli_packm_blk_var1rr;
#endif
packa_fp = bli_packm_blk_var1;
packb_fp = bli_packm_blk_var1;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node

View File

@@ -116,8 +116,7 @@ void bli_gemm_int
if ( im != BLIS_NAT )
{
if ( im == BLIS_4M1B )
if ( f == bli_gemm_ker_var2sl ||
f == bli_gemm_ker_var2rr ) f = bli_gemm4mb_ker_var2;
if ( f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
}
}

View File

@@ -0,0 +1,379 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Abbreviation for the function pointer type of the typed gemm
// macrokernels defined later in this file.
#define FUNCPTR_T gemm_fp
// Signature shared by every datatype-specific instance of the gemm
// macrokernel (generated below via GENTFUNC/INSERT_GENTFUNC_BASIC0).
typedef void (*FUNCPTR_T)
(
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of per-datatype function pointers; the object-based wrapper below
// selects an entry via ftypes[dt_exec].
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
// Object-based entry point for the gemm macrokernel. Extracts the raw
// buffers, strides, and packing geometry from the obj_t operands, resolves
// the alpha/beta scalars, optionally recasts a 1m-packed execution to the
// real domain, and dispatches to the typed implementation.
void bli_gemm_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
    // Datatype in which the computation will execute.
    num_t dt = bli_obj_exec_dt( c );

    // Pack schemas of the (packed) A and B operands.
    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // Problem dimensions: C is m x n; A contributes k columns.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    // Buffer address and packing geometry of A.
    void* a_buf = bli_obj_buffer_at_off( a );
    inc_t cs_a  = bli_obj_col_stride( a );
    inc_t is_a  = bli_obj_imag_stride( a );
    dim_t pd_a  = bli_obj_panel_dim( a );
    inc_t ps_a  = bli_obj_panel_stride( a );

    // Buffer address and packing geometry of B.
    void* b_buf = bli_obj_buffer_at_off( b );
    inc_t rs_b  = bli_obj_row_stride( b );
    inc_t is_b  = bli_obj_imag_stride( b );
    dim_t pd_b  = bli_obj_panel_dim( b );
    inc_t ps_b  = bli_obj_panel_stride( b );

    // Buffer address and strides of C.
    void* c_buf = bli_obj_buffer_at_off( c );
    inc_t rs_c  = bli_obj_row_stride( c );
    inc_t cs_c  = bli_obj_col_stride( c );

    // Detach the scalars attached to A and B and fold them into one
    // scalar, accumulated into scal_b.
    obj_t scal_a;
    obj_t scal_b;
    bli_obj_scalar_detach( a, &scal_a );
    bli_obj_scalar_detach( b, &scal_b );
    bli_mulsc( &scal_a, &scal_b );

    // The merged scalar serves as alpha; the scalar attached to C is beta.
    void* alpha_buf = bli_obj_internal_scalar_buffer( &scal_b );
    void* beta_buf  = bli_obj_internal_scalar_buffer( c );

    // If 1m is being employed on a column- or row-stored matrix with a
    // real-valued beta, we can use the real domain macro-kernel, which
    // eliminates a little overhead associated with the 1m virtual
    // micro-kernel. NOTE(review): this macro presumably updates dt,
    // schema_a, and the dimension/stride variables in place, consistent
    // with the ftypes[dt] lookup occurring afterwards.
    if ( bli_is_1m_packed( schema_a ) )
    {
        bli_l3_ind_recast_1m_params
        (
          dt,
          schema_a,
          c,
          m, n, k,
          pd_a, ps_a,
          pd_b, ps_b,
          rs_c, cs_c
        );
    }

    // Select the typed instance for the (possibly recast) datatype.
    FUNCPTR_T f = ftypes[dt];

    // Invoke the typed macrokernel.
    f( schema_a,
       schema_b,
       m,
       n,
       k,
       alpha_buf,
       a_buf, cs_a, is_a,
       pd_a, ps_a,
       b_buf, rs_b, is_b,
       pd_b, ps_b,
       beta_buf,
       c_buf, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}
// Define the typed gemm macrokernels: the 2nd (jr) and 1st (ir) loops
// around the microkernel. One instance of this macro is expanded per
// datatype via INSERT_GENTFUNC_BASIC0 below (e.g. bli_dgemm_ker_var2).
// Whether micropanels are assigned to threads in slab or round-robin
// fashion is hidden inside bli_thread_range_jrir(); see the NOTE in the
// body below.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for the 2nd and 1st loops.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add in the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )

View File

@@ -60,8 +60,7 @@ GENPROT( gemm_packb )
GENPROT( gemm_ker_var1 )
GENPROT( gemm_ker_var2sl )
GENPROT( gemm_ker_var2rr )
GENPROT( gemm_ker_var2 )
// Headers for induced algorithms:
GENPROT( gemm4mb_ker_var2 ) // 4m1b
@@ -93,8 +92,7 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
// Headers for induced algorithms:
INSERT_GENTPROT_BASIC0( gemm4mb_ker_var2 ) // 4m1b

View File

@@ -0,0 +1,555 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Abbreviation for the function pointer type of the typed herk (lower)
// macrokernels defined later in this file.
#define FUNCPTR_T herk_fp
// Signature shared by every datatype-specific instance of the lower-stored
// herk macrokernel; diagoffc is the diagonal offset of the current panel
// of C.
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of per-datatype function pointers; the object-based wrapper below
// selects an entry via ftypes[dt_exec].
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
// Object-based entry point for the herk macrokernel on a lower-stored C.
// Extracts raw buffers, strides, packing geometry, and the diagonal offset
// of C from the obj_t operands, resolves alpha/beta, and dispatches to the
// typed implementation.
void bli_herk_l_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
    // Datatype in which the computation will execute, and the diagonal
    // offset of the current panel of C.
    num_t  dt       = bli_obj_exec_dt( c );
    doff_t diagoffc = bli_obj_diag_offset( c );

    // Pack schemas of the (packed) A and B operands.
    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // Problem dimensions: C is m x n; A contributes k columns.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    // Buffer address and packing geometry of A.
    void* a_buf = bli_obj_buffer_at_off( a );
    inc_t cs_a  = bli_obj_col_stride( a );
    inc_t is_a  = bli_obj_imag_stride( a );
    dim_t pd_a  = bli_obj_panel_dim( a );
    inc_t ps_a  = bli_obj_panel_stride( a );

    // Buffer address and packing geometry of B.
    void* b_buf = bli_obj_buffer_at_off( b );
    inc_t rs_b  = bli_obj_row_stride( b );
    inc_t is_b  = bli_obj_imag_stride( b );
    dim_t pd_b  = bli_obj_panel_dim( b );
    inc_t ps_b  = bli_obj_panel_stride( b );

    // Buffer address and strides of C.
    void* c_buf = bli_obj_buffer_at_off( c );
    inc_t rs_c  = bli_obj_row_stride( c );
    inc_t cs_c  = bli_obj_col_stride( c );

    // Detach the scalars attached to A and B and fold them into one
    // scalar, accumulated into scal_b.
    obj_t scal_a;
    obj_t scal_b;
    bli_obj_scalar_detach( a, &scal_a );
    bli_obj_scalar_detach( b, &scal_b );
    bli_mulsc( &scal_a, &scal_b );

    // The merged scalar serves as alpha; the scalar attached to C is beta.
    void* alpha_buf = bli_obj_internal_scalar_buffer( &scal_b );
    void* beta_buf  = bli_obj_internal_scalar_buffer( c );

    // Select the typed instance for the execution datatype.
    FUNCPTR_T f = ftypes[dt];

    // Invoke the typed macrokernel.
    f( diagoffc,
       schema_a,
       schema_b,
       m,
       n,
       k,
       alpha_buf,
       a_buf, cs_a, is_a,
       pd_a, ps_a,
       b_buf, rs_b, is_b,
       pd_b, ps_b,
       beta_buf,
       c_buf, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}
// Define the typed herk macrokernels for a lower-stored C: the 2nd (jr)
// and 1st (ir) loops around the microkernel. One instance is expanded per
// datatype via INSERT_GENTFUNC_BASIC0 below. The jr loop is split into a
// rectangular region (no microtiles touch the diagonal; partitioned via
// bli_thread_range_jrir(), which honors the configure-time slab/rr choice)
// and a triangular region (microtiles may intersect the diagonal;
// partitioned unconditionally round-robin via bli_thread_range_jrir_rr()).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, ip; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely above the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region above where the diagonal of C intersects
the left edge of the panel, adjust the pointer to C and A and treat
this case as if the diagonal offset were zero. */ \
if ( diagoffc < 0 ) \
{ \
ip = -diagoffc / MR; \
i = ip * MR; \
m = m - i; \
diagoffc = -diagoffc % MR; \
c_cast = c_cast + (i )*rs_c; \
a_cast = a_cast + (ip )*ps_a; \
} \
\
/* If there is a zero region to the right of where the diagonal
of C intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffc + m < n ) \
{ \
n = diagoffc + m; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of C, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. Any remainder from this integer division is discarded, which
is what we want. That is, we want the rectangular region to contain
as many columns of whole microtiles as possible without including any
microtiles that intersect the diagonal. The number of iterations in
the triangular (or trapezoidal) region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_rct = diagoffc / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Determine the thread range and increment for the 2nd and 1st loops for
the initial rectangular region of C (if it exists).
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. */ \
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Within this rectangular region, no microtile intersects the
diagonal (by construction of n_iter_rct above), so every
MR x NR submatrix is computed and stored as it would be in
gemm. */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
and the default (slab or rr) partitioning in the 1st loop for the
remaining triangular region of C. */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the triangular region
by the number of iterations used for the rectangular region. */ \
jr_start += n_iter_rct; \
jr_end += n_iter_rct; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )

View File

@@ -0,0 +1,558 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Abbreviation for the function pointer type of the typed herk (upper)
// macrokernels defined later in this file.
#define FUNCPTR_T herk_fp
// Signature shared by every datatype-specific instance of the upper-stored
// herk macrokernel; diagoffc is the diagonal offset of the current panel
// of C.
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of per-datatype function pointers; the object-based wrapper below
// selects an entry via ftypes[dt_exec].
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
// Object-based entry point for the herk macrokernel on an upper-stored C.
// Extracts raw buffers, strides, packing geometry, and the diagonal offset
// of C from the obj_t operands, resolves alpha/beta, and dispatches to the
// typed implementation.
void bli_herk_u_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
    // Datatype in which the computation will execute, and the diagonal
    // offset of the current panel of C.
    num_t  dt       = bli_obj_exec_dt( c );
    doff_t diagoffc = bli_obj_diag_offset( c );

    // Pack schemas of the (packed) A and B operands.
    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // Problem dimensions: C is m x n; A contributes k columns.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    // Buffer address and packing geometry of A.
    void* a_buf = bli_obj_buffer_at_off( a );
    inc_t cs_a  = bli_obj_col_stride( a );
    inc_t is_a  = bli_obj_imag_stride( a );
    dim_t pd_a  = bli_obj_panel_dim( a );
    inc_t ps_a  = bli_obj_panel_stride( a );

    // Buffer address and packing geometry of B.
    void* b_buf = bli_obj_buffer_at_off( b );
    inc_t rs_b  = bli_obj_row_stride( b );
    inc_t is_b  = bli_obj_imag_stride( b );
    dim_t pd_b  = bli_obj_panel_dim( b );
    inc_t ps_b  = bli_obj_panel_stride( b );

    // Buffer address and strides of C.
    void* c_buf = bli_obj_buffer_at_off( c );
    inc_t rs_c  = bli_obj_row_stride( c );
    inc_t cs_c  = bli_obj_col_stride( c );

    // Detach the scalars attached to A and B and fold them into one
    // scalar, accumulated into scal_b.
    obj_t scal_a;
    obj_t scal_b;
    bli_obj_scalar_detach( a, &scal_a );
    bli_obj_scalar_detach( b, &scal_b );
    bli_mulsc( &scal_a, &scal_b );

    // The merged scalar serves as alpha; the scalar attached to C is beta.
    void* alpha_buf = bli_obj_internal_scalar_buffer( &scal_b );
    void* beta_buf  = bli_obj_internal_scalar_buffer( c );

    // Select the typed instance for the execution datatype.
    FUNCPTR_T f = ftypes[dt];

    // Invoke the typed macrokernel.
    f( diagoffc,
       schema_a,
       schema_b,
       m,
       n,
       k,
       alpha_buf,
       a_buf, cs_a, is_a,
       pd_a, ps_a,
       b_buf, rs_b, is_b,
       pd_b, ps_b,
       beta_buf,
       c_buf, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}
#undef GENTFUNC

// Type-generic macrokernel for herk/syrk when C is upper-stored (only the
// region on or above the diagonal of C is updated). The 2nd (jr) loop is
// split into two regions: the triangular region of micropanel columns that
// intersect the diagonal (always partitioned round-robin among threads so
// the uneven, diagonal-clipped work is balanced) and the remaining
// rectangular region strictly above the diagonal (partitioned with the
// default slab or round-robin scheme chosen at configure-time).
// Instantiated for each datatype below via INSERT_GENTFUNC_BASIC0.
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, jp; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely below the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, adjust the pointer to C and B
and treat this case as if the diagonal offset were zero.
NOTE: It's possible that after this pruning that the diagonal offset
is still positive (though it is guaranteed to be less than NR). */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
j = jp * NR; \
n = n - j; \
diagoffc = diagoffc % NR; \
c_cast = c_cast + (j )*cs_c; \
b_cast = b_cast + (jp )*ps_b; \
} \
\
/* If there is a zero region below where the diagonal of C intersects
the right edge of the panel, shrink it to prevent "no-op" iterations
from executing. */ \
if ( -diagoffc + n < m ) \
{ \
m = -diagoffc + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in C. A non-zero remainder means we need to
add one additional iteration. That is, we want the triangular region
to contain as few columns of whole microtiles as possible while still
including all microtiles that intersect the diagonal. The number of
iterations in the rectangular region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
and the default (slab or rr) partitioning in the 1st loop for the
initial triangular region of C (if it exists). */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
/* NOTE: the _rr flavor of the last-iteration test is used here to
match the round-robin partitioning employed unconditionally for
this (triangular) region. */ \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or above the diagonal (the stored part of upper-stored C).
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Determine the thread range and increment for the 2nd loop of the
remaining rectangular region of C (and also use default partitioning
for the 1st loop).
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. */ \
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Every iteration of this loop lies in the rectangular region of C,
which is strictly above the diagonal, so each MR x NR submatrix
may be computed and stored as in an ordinary gemm (no diagonal
test needed). */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )

View File

@@ -56,13 +56,10 @@ void PASTEMAC0(opname) \
//GENPROT( herk_blk_var2 )
//GENPROT( herk_blk_var3 )
GENPROT( herk_x_ker_var2sl )
GENPROT( herk_x_ker_var2rr )
GENPROT( herk_x_ker_var2 )
GENPROT( herk_l_ker_var2sl )
GENPROT( herk_l_ker_var2rr )
GENPROT( herk_u_ker_var2sl )
GENPROT( herk_u_ker_var2rr )
GENPROT( herk_l_ker_var2 )
GENPROT( herk_u_ker_var2 )
//GENPROT( herk_packa )
//GENPROT( herk_packb )
@@ -94,8 +91,6 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( herk_l_ker_var2sl )
INSERT_GENTPROT_BASIC0( herk_l_ker_var2rr )
INSERT_GENTPROT_BASIC0( herk_u_ker_var2sl )
INSERT_GENTPROT_BASIC0( herk_u_ker_var2rr )
INSERT_GENTPROT_BASIC0( herk_l_ker_var2 )
INSERT_GENTPROT_BASIC0( herk_u_ker_var2 )

View File

@@ -35,12 +35,12 @@
#include "blis.h"
static gemm_var_oft vars_sl[2] =
static gemm_var_oft vars[2] =
{
bli_herk_l_ker_var2sl, bli_herk_u_ker_var2sl,
bli_herk_l_ker_var2, bli_herk_u_ker_var2,
};
void bli_herk_x_ker_var2sl
void bli_herk_x_ker_var2
(
obj_t* a,
obj_t* ah,
@@ -59,48 +59,7 @@ void bli_herk_x_ker_var2sl
else uplo = 1;
// Index into the variant array to extract the correct function pointer.
f = vars_sl[uplo];
// Call the macrokernel.
f
(
a,
ah,
c,
cntx,
rntm,
cntl,
thread
);
}
// -----------------------------------------------------------------------------
static gemm_var_oft vars_rr[2] =
{
bli_herk_l_ker_var2rr, bli_herk_u_ker_var2rr,
};
void bli_herk_x_ker_var2rr
(
obj_t* a,
obj_t* ah,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bool_t uplo;
gemm_var_oft f;
// Set a bool based on the uplo field of C's root object.
if ( bli_obj_root_is_lower( c ) ) uplo = 0;
else uplo = 1;
// Index into the variant array to extract the correct function pointer.
f = vars_rr[uplo];
f = vars[uplo];
// Call the macrokernel.
f

View File

@@ -0,0 +1,533 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
// Object-based entry point for the trmm_ll macrokernel. This routine
// unpacks the numerical fields of the obj_t operands, merges the scalars
// attached to A and B into a single alpha, and dispatches to the typed
// macrokernel selected by C's execution datatype.
void bli_trmm_ll_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// Detach the scalars attached to A and B and fold them into one
	// scalar object (scalar_b holds the product afterwards).
	obj_t scalar_a;
	obj_t scalar_b;

	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Alpha is the merged scalar; beta is the scalar attached to C.
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Select the typed macrokernel for C's execution datatype.
	num_t     dt_exec = bli_obj_exec_dt( c );
	FUNCPTR_T f       = ftypes[ dt_exec ];

	// Invoke the typed function, querying each operand field in place.
	f
	(
	  bli_obj_diag_offset( a ),
	  bli_obj_pack_schema( a ),
	  bli_obj_pack_schema( b ),
	  bli_obj_length( c ),
	  bli_obj_width( c ),
	  bli_obj_width( a ),
	  buf_alpha,
	  bli_obj_buffer_at_off( a ), bli_obj_col_stride( a ),
	  bli_obj_panel_dim( a ), bli_obj_panel_stride( a ),
	  bli_obj_buffer_at_off( b ), bli_obj_row_stride( b ),
	  bli_obj_panel_dim( b ), bli_obj_panel_stride( b ),
	  buf_beta,
	  bli_obj_buffer_at_off( c ), bli_obj_row_stride( c ),
	  bli_obj_col_stride( c ),
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC

// Type-generic macrokernel for trmm/trmm3 with a lower-triangular A packed
// on the left. Micropanels of A that intersect the diagonal were packed
// with reduced length, so their panel (and 4m1/3m1 imaginary) strides are
// recomputed on the fly. Diagonal-intersecting panels scale C by beta;
// panels strictly below the diagonal accumulate with a scalar of one, which
// lets the same kernel serve both trmm and trmm3.
// Instantiated for each datatype below via INSERT_GENTFUNC_BASIC0.
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
/* Length of and offset into the portion of the current micro-panel of A
that was actually packed (the region covering the A10/A11 blocks --
hence the _a1011 suffix). */ \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Determine the thread range and increment for the 2nd loop.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. \
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1011 = 0; \
k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )

View File

@@ -0,0 +1,541 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
// Object-based entry point for the trmm_lu macrokernel. Collapses the
// obj_t operands to raw buffers/strides, folds the scalars attached to A
// and B into a single alpha, and calls the typed macrokernel chosen by
// C's execution datatype.
void bli_trmm_lu_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// Peel off the scalars attached to A and B and multiply them
	// together; sb ends up holding the product.
	obj_t sa;
	obj_t sb;

	bli_obj_scalar_detach( a, &sa );
	bli_obj_scalar_detach( b, &sb );
	bli_mulsc( &sa, &sb );

	// The merged scalar serves as alpha; C's attached scalar as beta.
	void* alpha_buf = bli_obj_internal_scalar_buffer( &sb );
	void* beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macrokernel for C's execution datatype.
	FUNCPTR_T kfp = ftypes[ bli_obj_exec_dt( c ) ];

	// Invoke the typed function, reading each operand field in place.
	kfp
	(
	  bli_obj_diag_offset( a ),
	  bli_obj_pack_schema( a ),
	  bli_obj_pack_schema( b ),
	  bli_obj_length( c ),
	  bli_obj_width( c ),
	  bli_obj_width( a ),
	  alpha_buf,
	  bli_obj_buffer_at_off( a ), bli_obj_col_stride( a ),
	  bli_obj_panel_dim( a ), bli_obj_panel_stride( a ),
	  bli_obj_buffer_at_off( b ), bli_obj_row_stride( b ),
	  bli_obj_panel_dim( b ), bli_obj_panel_stride( b ),
	  beta_buf,
	  bli_obj_buffer_at_off( c ), bli_obj_row_stride( c ),
	  bli_obj_col_stride( c ),
	  cntx,
	  rntm,
	  thread
	);
}
#undef  GENTFUNC

/* Type-instantiating macro for the trmm "lu" macrokernel, in which the
   triangular matrix A appears on the left and is stored in the upper
   triangle. (A block of A lying strictly below the diagonal is implicitly
   zero, which is why the safeguard below returns early in that case.)
   One function per datatype is generated by the INSERT_GENTFUNC_BASIC0
   invocation at the bottom. Micropanels of B (the 2nd/jr loop) are
   partitioned among threads via bli_thread_range_jrir(), which hides the
   slab-vs-round-robin choice made at configure-time; parallelism in the
   1st (ir) loop is currently disabled. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	const dim_t     PACKMR     = cs_a; \
	const dim_t     PACKNR     = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t          diagoffa_i; \
	dim_t           k_full; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	dim_t           k_a1112; \
	dim_t           off_a1112; \
	dim_t           i, j; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	inc_t           istep_a; \
	inc_t           istep_b; \
	inc_t           off_scl; \
	inc_t           ss_a_num; \
	inc_t           ss_a_den; \
	inc_t           ps_a_cur; \
	inc_t           is_a_cur; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current block of A is entirely below the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
	else                                      { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region to the left of where the diagonal of A
	   intersects the top edge of the block, adjust the pointer to B and
	   treat this case as if the diagonal offset were zero. Note that we
	   don't need to adjust the pointer to A since packm would have simply
	   skipped over the region that was not stored. */ \
	if ( diagoffa > 0 ) \
	{ \
		i        = diagoffa; \
		k        = k - i; \
		diagoffa = 0; \
		b_cast   = b_cast + ( i * PACKNR ) / off_scl; \
	} \
\
	/* If there is a zero region below where the diagonal of A intersects the
	   right side of the block, shrink it to prevent "no-op" iterations from
	   executing. */ \
	if ( -diagoffa + k < m ) \
	{ \
		m = -diagoffa + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	/* Round odd imaginary strides up to the nearest even value. */ \
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
	   loop around the microkernel. Here we query the thrinfo_t node for the
	   1st (ir) loop around the microkernel. */ \
	/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
	/*dim_t ir_nt  = bli_thread_n_way( ir_thread ); \
	dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
	dim_t jr_start, jr_end; \
	/*dim_t ir_start, ir_end;*/ \
	dim_t jr_inc; \
\
	/* Determine the thread range and increment for the 2nd loop.
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time. \
	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			/* Diagonal offset of the current MR x k micropanel of A. */ \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, scale C
			   by beta. If it is strictly above the diagonal, scale by one.
			   This allows the current macro-kernel to work for both trmm
			   and trmm3. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict b1_i; \
				ctype* restrict a2; \
\
				/* Determine the offset to and length of the panel that was
				   packed so we can index into the corresponding location in
				   b1. */ \
				off_a1112 = diagoffa_i; \
				k_a1112   = k - off_a1112; \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur  = k_a1112 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				/* NOTE: ir loop parallelism disabled for now. */ \
				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
				b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1112, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c,  cs_c, \
					                        ct,  rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1112, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        c11, rs_c,  cs_c ); \
				} \
				/*}*/ \
\
				/* Advance A by the stride of the diagonal-intersecting
				   micropanel just used (generally shorter than the stride
				   of a full micropanel). */ \
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				/* NOTE: ir loop parallelism disabled for now. */ \
				/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct,  rs_ct, cs_ct, \
					                       c11, rs_c,  cs_c ); \
				} \
				/*}*/ \
\
				a1 += rstep_a; \
			} \
\
			c11 += rstep_c; \
		} \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )

View File

@@ -0,0 +1,598 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp

// Signature shared by all type-specific trmm_rl_ker_var2 implementations
// (instantiated below via GENTFUNC/INSERT_GENTFUNC_BASIC0).
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffb,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of type-specific kernel function pointers, indexed by datatype.
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);

// Object-based entry point for the trmm_rl macrokernel. Unpacks the
// buffer addresses, strides, panel geometry, and pack schemas from the
// obj_t operands, merges the scalars attached to A and B into a single
// alpha, and dispatches to the type-specific kernel selected by the
// execution datatype of C.
void bli_trmm_rl_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffb = bli_obj_diag_offset( b );

	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer addresses, strides, and micropanel geometry of the packed
	// operands (and of C, which is unpacked).
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and fold them into a single
	// combined scalar (left in scalar_b).
	obj_t scalar_a, scalar_b;
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	void* const buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* const buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Look up the kernel that matches the execution datatype and
	// invoke it.
	FUNCPTR_T ker_fp = ftypes[ dt_exec ];

	ker_fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef  GENTFUNC

/* Type-instantiating macro for the trmm "rl" macrokernel, in which the
   triangular matrix B appears on the right and is stored in the lower
   triangle. The 2nd (jr) loop is split into two regions: the rectangular
   part of B, whose micropanels may be partitioned among threads with
   either slab or round-robin assignment (the choice is hidden inside
   bli_thread_range_jrir()), and the diagonal-intersecting (triangular)
   part, which unconditionally uses round-robin assignment via
   bli_trmm_my_iter_rr(). One function per datatype is generated by the
   INSERT_GENTFUNC_BASIC0 invocation at the bottom. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffb, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	const dim_t     PACKMR     = cs_a; \
	const dim_t     PACKNR     = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t          diagoffb_j; \
	dim_t           k_full; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	dim_t           k_b1121; \
	dim_t           off_b1121; \
	dim_t           i, j; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	inc_t           istep_a; \
	inc_t           istep_b; \
	inc_t           off_scl; \
	inc_t           ss_b_num; \
	inc_t           ss_b_den; \
	inc_t           ps_b_cur; \
	inc_t           is_b_cur; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of B is entirely above the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of A (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_b ) || \
	     bli_is_3mi_packed( schema_b ) || \
	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
	else                                      { ss_b_num = 1; ss_b_den = 1; } \
\
	/* If there is a zero region above where the diagonal of B intersects
	   the left edge of the panel, adjust the pointer to A and treat this
	   case as if the diagonal offset were zero. Note that we don't need to
	   adjust the pointer to B since packm would have simply skipped over
	   the region that was not stored. */ \
	if ( diagoffb < 0 ) \
	{ \
		j        = -diagoffb; \
		k        = k - j; \
		diagoffb = 0; \
		a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
	} \
\
	/* If there is a zero region to the right of where the diagonal
	   of B intersects the bottom of the panel, shrink it to prevent
	   "no-op" iterations from executing. */ \
	if ( diagoffb + k < n ) \
	{ \
		n = diagoffb + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k_full; \
	istep_b = PACKNR * k; \
\
	/* Round odd imaginary strides up to the nearest even value. */ \
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( istep_a, &aux ); \
\
	/* 'thread' is the thrinfo_t node for the 2nd (jr) loop; query the
	   sub-node ('caucus') for the 1st (ir) loop around the microkernel. */ \
	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
	dim_t ir_nt  = bli_thread_n_way( caucus ); \
	dim_t ir_tid = bli_thread_work_id( caucus ); \
\
	dim_t jr_start, jr_end; \
	dim_t ir_start, ir_end; \
	dim_t jr_inc,   ir_inc; \
\
	/* Note that we partition the 2nd loop into two regions: the rectangular
	   part of B, and the triangular portion. */ \
	dim_t n_iter_rct; \
	dim_t n_iter_tri; \
\
	if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
	{ \
		/* If the entire panel of B does not intersect the diagonal, there is
		   no triangular region, and therefore we can skip the second set of
		   loops. */ \
		n_iter_rct = n_iter; \
		n_iter_tri = 0; \
	} \
	else \
	{ \
		/* If the panel of B does intersect the diagonal, compute the number of
		   iterations in the rectangular region by dividing NR into the diagonal
		   offset. (There should never be any remainder in this division.) The
		   number of iterations in the triangular (or trapezoidal) region is
		   computed as the remaining number of iterations in the n dimension. */ \
		n_iter_rct = diagoffb / NR; \
		n_iter_tri = n_iter - n_iter_rct; \
	} \
\
	/* Determine the thread range and increment for the 2nd and 1st loops for
	   the initial rectangular region of B (if it exists).
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time. \
	   NOTE: Parallelism in the 1st loop is disabled for now. */ \
	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		{ \
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( istep_b, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = ir_start; i < ir_end; i += ir_inc ) \
			{ \
				ctype* restrict a2; \
\
				a1  = a_cast + i * rstep_a; \
				c11 = c1     + i * rstep_c; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
				if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
				{ \
					a2 = a_cast; \
					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct,  rs_ct, cs_ct, \
					                       c11, rs_c,  cs_c ); \
				} \
			} \
		} \
	} \
\
	/* If there is no triangular region, then we're done. */ \
	if ( n_iter_tri == 0 ) return; \
\
	/* Use round-robin assignment of micropanels to threads in the 2nd and
	   1st loops for the remaining triangular region of B (if it exists).
	   NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
	   employ a hack that calls for each thread to execute every iteration
	   of the jr and ir loops but skip all but the pointer increment for
	   iterations that are not assigned to it. */ \
\
	/* Advance the starting b1 and c1 pointers to the positions corresponding
	   to the start of the triangular region of B. */ \
	jr_start = n_iter_rct; \
	b1 = b_cast + jr_start * cstep_b; \
	c1 = c_cast + jr_start * cstep_c; \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < n_iter; ++j ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		/* Diagonal offset of the current k x NR micropanel of B. */ \
		diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
		/* Determine the offset to the beginning of the panel that
		   was packed so we can index into the corresponding location
		   in A. Then compute the length of that panel. */ \
		off_b1121 = bli_max( -diagoffb_j, 0 ); \
		k_b1121   = k - off_b1121; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* If the current panel of B intersects the diagonal, scale C
		   by beta. If it is strictly below the diagonal, scale by one.
		   This allows the current macro-kernel to work for both trmm
		   and trmm3. */ \
		{ \
			/* Compute the panel stride for the current diagonal-
			   intersecting micro-panel. */ \
			is_b_cur  = k_b1121 * PACKNR; \
			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
			ps_b_cur  = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
			if ( bli_trmm_my_iter_rr( j, thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
\
				ctype* restrict a1_i; \
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c,  cs_c, \
					                        ct,  rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        c11, rs_c,  cs_c ); \
				} \
				} \
\
				/* Every thread advances the A and C pointers, even for
				   iterations it did not compute (the round-robin "hack"
				   described above). */ \
				a1  += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			b1 += ps_b_cur; \
		} \
\
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )

View File

@@ -0,0 +1,618 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp

// Signature shared by all type-specific trmm_ru_ker_var2 implementations
// (instantiated below via GENTFUNC/INSERT_GENTFUNC_BASIC0).
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffb,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of type-specific kernel function pointers, indexed by datatype.
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);

// Object-based entry point for the trmm_ru macrokernel. Unpacks the
// buffer addresses, strides, panel geometry, and pack schemas from the
// obj_t operands, merges the scalars attached to A and B into a single
// alpha, and dispatches to the type-specific kernel selected by the
// execution datatype of C.
void bli_trmm_ru_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffb = bli_obj_diag_offset( b );

	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer addresses, strides, and micropanel geometry of the packed
	// operands (and of C, which is unpacked).
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and fold them into a single
	// combined scalar (left in scalar_b).
	obj_t scalar_a, scalar_b;
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	void* const buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* const buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Look up the kernel that matches the execution datatype and
	// invoke it.
	FUNCPTR_T ker_fp = ftypes[ dt_exec ];

	ker_fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
/*
   Typed macrokernel for trmm_ru (right-side trmm with an upper-stored
   triangular B). The 2nd (jr) loop is partitioned into two regions: the
   initial diagonal-intersecting ("triangular") region of B, which always
   uses round-robin assignment of micropanels to threads (via
   bli_trmm_my_iter_rr()), and the remaining rectangular region, which is
   partitioned via bli_thread_range_jrir() (slab or round-robin, depending
   on the configure-time --thread-part-jrir option).
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j, jb0; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in B. (There should never be any remainder
in this division.) The number of iterations in the rectangular region
is computed as the remaining number of iterations in the n dimension. */ \
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and
1st loops for the initial triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter_tri; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter_rr( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Determine the thread range and increment for the 2nd and 1st loops for
the remaining rectangular region of B.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. \
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
jb0 = n_iter_tri; \
\
/* Save the resulting value of b1 from the previous loop since it represents
the starting point for the rectangular region. */ \
b_cast = b1; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* NOTE: We must index through b_cast differently since it contains
the starting address of the rectangular region (which is already
n_iter_tri logical iterations through B). */ \
b1 = b_cast + (j-jb0) * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
\
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )

View File

@@ -56,17 +56,12 @@ void PASTEMAC0(opname) \
//GENPROT( trmm_blk_var2 )
//GENPROT( trmm_blk_var3 )
GENPROT( trmm_xx_ker_var2sl )
GENPROT( trmm_xx_ker_var2rr )
GENPROT( trmm_xx_ker_var2 )
GENPROT( trmm_ll_ker_var2sl )
GENPROT( trmm_ll_ker_var2rr )
GENPROT( trmm_lu_ker_var2sl )
GENPROT( trmm_lu_ker_var2rr )
GENPROT( trmm_rl_ker_var2sl )
GENPROT( trmm_rl_ker_var2rr )
GENPROT( trmm_ru_ker_var2sl )
GENPROT( trmm_ru_ker_var2rr )
GENPROT( trmm_ll_ker_var2 )
GENPROT( trmm_lu_ker_var2 )
GENPROT( trmm_rl_ker_var2 )
GENPROT( trmm_ru_ker_var2 )
//
@@ -96,12 +91,8 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 )

View File

@@ -35,13 +35,13 @@
#include "blis.h"
static gemm_var_oft vars_sl[2][2] =
static gemm_var_oft vars[2][2] =
{
{ bli_trmm_ll_ker_var2sl, bli_trmm_lu_ker_var2sl },
{ bli_trmm_rl_ker_var2sl, bli_trmm_ru_ker_var2sl }
{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
};
void bli_trmm_xx_ker_var2sl
void bli_trmm_xx_ker_var2
(
obj_t* a,
obj_t* b,
@@ -73,62 +73,7 @@ void bli_trmm_xx_ker_var2sl
}
// Index into the variant array to extract the correct function pointer.
f = vars_sl[side][uplo];
// Call the macrokernel.
f
(
a,
b,
c,
cntx,
rntm,
cntl,
thread
);
}
// -----------------------------------------------------------------------------
static gemm_var_oft vars_rr[2][2] =
{
{ bli_trmm_ll_ker_var2rr, bli_trmm_lu_ker_var2rr },
{ bli_trmm_rl_ker_var2rr, bli_trmm_ru_ker_var2rr }
};
void bli_trmm_xx_ker_var2rr
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bool_t side;
bool_t uplo;
gemm_var_oft f;
// Set two bools: one based on the implied side parameter (the structure
// of the root object) and one based on the uplo field of the triangular
// matrix's root object (whether that is matrix A or matrix B).
if ( bli_obj_root_is_triangular( a ) )
{
side = 0;
if ( bli_obj_root_is_lower( a ) ) uplo = 0;
else uplo = 1;
}
else // if ( bli_obj_root_is_triangular( b ) )
{
side = 1;
if ( bli_obj_root_is_lower( b ) ) uplo = 0;
else uplo = 1;
}
// Index into the variant array to extract the correct function pointer.
f = vars_rr[side][uplo];
f = vars[side][uplo];
// Call the macrokernel.
f

View File

@@ -58,24 +58,12 @@ cntl_t* bli_trsm_l_cntl_create
void* packa_fp;
void* packb_fp;
#ifdef BLIS_ENABLE_JRIR_SLAB
// Use the function pointer to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_p = bli_trsm_xx_ker_var2sl;
macro_kernel_p = bli_trsm_xx_ker_var2;
packa_fp = bli_packm_blk_var1sl;
packb_fp = bli_packm_blk_var1sl;
#else // BLIS_ENABLE_JRIR_RR
// Use the function pointer to the macrokernels that use round-robin
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_p = bli_trsm_xx_ker_var2rr;
packa_fp = bli_packm_blk_var1rr;
packb_fp = bli_packm_blk_var1rr;
#endif
packa_fp = bli_packm_blk_var1;
packb_fp = bli_packm_blk_var1;
const opid_t family = BLIS_TRSM;
@@ -162,16 +150,11 @@ cntl_t* bli_trsm_r_cntl_create
pack_t schema_b
)
{
// trsm macrokernels are presently disabled for right-side execution,
// so it doesn't matter which function pointer we use here (sl or rr).
// To be safe, we'll insert an abort() guard to alert the developers
// of this should right-side macrokernels ever be re-enabled.
void* macro_kernel_p = bli_trsm_xx_ker_var2sl;
// NOTE: trsm macrokernels are presently disabled for right-side execution.
void* macro_kernel_p = bli_trsm_xx_ker_var2;
void* packa_fp = bli_packm_blk_var1sl;
void* packb_fp = bli_packm_blk_var1sl;
bli_abort();
void* packa_fp = bli_packm_blk_var1;
void* packb_fp = bli_packm_blk_var1;
const opid_t family = BLIS_TRSM;

View File

@@ -0,0 +1,604 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
// Object-based entry point for the lower-triangular, left-side trsm
// macrokernel. This wrapper unpacks the raw buffers, strides, and packing
// geometry from the obj_t operands and then dispatches to the
// datatype-specific implementation via the ftypes function pointer array.
void bli_trsm_ll_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C selects which typed kernel is invoked.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	// Diagonal offset of A and the pack schemas of both packed operands.
	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n; k is the width of A.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer, stride, and panel geometry for packed matrix A.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Buffer, stride, and panel geometry for packed matrix B.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Buffer and strides for output matrix C.
	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// The scalar attached to B (the non-triangular matrix) serves as the
	// alpha for the gemmtrsm subproblems; it may be unit if it was already
	// applied during packing.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C acts as the "beta" for the gemm-only
	// subproblems on micro-panels that do not intersect the diagonal. It is
	// kept separate because the alpha attached to B may have been reset
	// during packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the typed function for the execution datatype and invoke it.
	ftypes[dt_exec]
	(
	  diagoffa,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   Typed macrokernel for trsm_ll (left-side trsm with a lower-stored
   triangular A). Micro-panels of A that intersect the diagonal are handled
   by the fused gemmtrsm microkernel; panels strictly below the diagonal use
   the plain gemm microkernel; panels strictly above the diagonal are
   implicitly zero and are skipped. Parallelism is available only in the
   2nd (jr) loop; the 1st (ir) loop carries inter-iteration dependencies.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
dim_t off_a11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Determine the thread range and increment for the 2nd loop.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a10 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a10 = k_a1011 - MR; \
off_a11 = k_a10; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )

View File

@@ -0,0 +1,585 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Abbreviation for the function pointer type shared by all datatype
// instances of this macrokernel. (The name "gemm_fp" is reused here even
// though this file implements trsm; the signature below is what matters.)
#define FUNCPTR_T gemm_fp

// Signature of the datatype-specific trsm_lu macrokernels defined by the
// GENTFUNC template below. Operands:
//   diagoffa          diagonal offset of the packed triangular matrix A
//   schema_a/schema_b pack schemas of A and B
//   m, n, k           problem dimensions
//   alpha1            scalar applied within the gemmtrsm subproblems
//   a, cs_a, pd_a, ps_a  packed A: column stride (PACKMR), panel dim (MR),
//                        panel stride
//   b, rs_b, pd_b, ps_b  packed B: row stride (PACKNR), panel dim (NR),
//                        panel stride
//   alpha2            "beta" scalar for the gemm-only subproblems
//   c, rs_c, cs_c     output matrix C and its strides
typedef void (*FUNCPTR_T)
     (
       doff_t     diagoffa,
       pack_t     schema_a,
       pack_t     schema_b,
       dim_t      m,
       dim_t      n,
       dim_t      k,
       void*      alpha1,
       void*      a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*      b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*      alpha2,
       void*      c, inc_t rs_c, inc_t cs_c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Table of type-specific function pointers, indexed by datatype value.
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
// Object-based entry point for the lower-right (left-side, upper-triangular)
// trsm macrokernel. Unpacks the operand metadata from the obj_t wrappers and
// dispatches to the appropriate datatype-specific implementation.
void bli_trsm_lu_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C selects which typed macrokernel runs.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffa = bli_obj_diag_offset( a );

	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Packed-matrix geometry for A: buffer, column stride, panel dim (MR),
	// and panel stride.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Packed-matrix geometry for B: buffer, row stride, panel dim (NR),
	// and panel stride.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// The scalar attached to B (the non-triangular operand) becomes the
	// alpha used in the gemmtrsm subproblems (i.e., the scalar that would
	// have been applied to the packed copy of B prior to the trsm update).
	// It may be unit if it was already applied during packing.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C becomes the "beta" used in the gemm-only
	// subproblems for micro-panels that do not intersect the diagonal. A
	// separate scalar is needed because the one attached to B may have been
	// reset during packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the typed implementation and invoke it.
	FUNCPTR_T fp = ftypes[ dt_exec ];

	fp
	(
	  diagoffa,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef  GENTFUNC

// Type-specific trsm_lu macrokernel template. Iterates over micro-panels of
// C bottom-to-top (the solve proceeds upward for an upper-triangular A),
// invoking the fused gemmtrsm microkernel on diagonal-intersecting panels
// of A and the plain gemm microkernel on panels strictly above the diagonal.
// Fixes relative to the previous revision: removed a stray double semicolon
// after the k_a1112 assignment, and corrected the comment describing the
// B11/B21 address computation (it previously referred to B01/B11, which
// describes the lower-triangular variant).
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t     diagoffa, \
       pack_t     schema_a, \
       pack_t     schema_b, \
       dim_t      m, \
       dim_t      n, \
       dim_t      k, \
       void*      alpha1, \
       void*      a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*      b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*      alpha2, \
       void*      c, inc_t rs_c, inc_t cs_c, \
       cntx_t*    cntx, \
       rntm_t*    rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt          = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR          = pd_a; \
	const dim_t     NR          = pd_b; \
	const dim_t     PACKMR      = cs_a; \
	const dim_t     PACKNR      = rs_b; \
\
	/* Cast the micro-kernel address to its function pointer type. */ \
	PASTECH(ch,gemmtrsm_ukr_ft) \
	       gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
	PASTECH(ch,gemm_ukr_ft) \
	       gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero        = PASTEMAC(ch,0); \
	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
	ctype* restrict a_cast      = a; \
	ctype* restrict b_cast      = b; \
	ctype* restrict c_cast      = c; \
	ctype* restrict alpha1_cast = alpha1; \
	ctype* restrict alpha2_cast = alpha2; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t          diagoffa_i; \
	dim_t           k_full; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	dim_t           k_a1112; \
	dim_t           k_a11; \
	dim_t           k_a12; \
	dim_t           off_a11; \
	dim_t           off_a12; \
	dim_t           i, j, ib; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	inc_t           istep_a; \
	inc_t           istep_b; \
	inc_t           off_scl; \
	inc_t           ss_a_num; \
	inc_t           ss_a_den; \
	inc_t           ps_a_cur; \
	inc_t           is_a_cur; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
	   So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full as k inflated up to a multiple of MR. This is
	   needed because some parameter combinations of trsm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. Note that real-only, imag-only, and summed-only
	   packing formats are not applicable here since trsm is a two-
	   operand operation only (unlike trmm, which is capable of three-
	   operand). */ \
	if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else                                 { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region to the left of where the diagonal of A
	   intersects the top edge of the block, adjust the pointer to B and
	   treat this case as if the diagonal offset were zero. Note that we
	   don't need to adjust the pointer to A since packm would have simply
	   skipped over the region that was not stored. */ \
	if ( diagoffa > 0 ) \
	{ \
		i        = diagoffa; \
		k        = k - i; \
		diagoffa = 0; \
		b_cast   = b_cast + ( i * PACKNR ) / off_scl; \
	} \
\
	/* If there is a zero region below where the diagonal of A intersects the
	   right side of the block, shrink it to prevent "no-op" iterations from
	   executing. */ \
	if ( -diagoffa + k < m ) \
	{ \
		m = -diagoffa + k; \
	} \
\
	/* Check the k dimension, which needs to be a multiple of MR. If k
	   isn't a multiple of MR, we adjust it higher to satisfy the micro-
	   kernel, which is expecting to perform an MR x MR triangular solve.
	   This adjustment of k is consistent with what happened when A was
	   packed: all of its bottom/right edges were zero-padded, and
	   furthermore, the panel that stores the bottom-right corner of the
	   matrix has its diagonal extended into the zero-padded region (as
	   identity). This allows the trsm of that bottom-right panel to
	   proceed without producing any infs or NaNs that would infect the
	   "good" values of the corresponding block of B. */ \
	if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
	/* NOTE: We don't need to check that m is a multiple of PACKMR since we
	   know that the underlying buffer was already allocated to have an m
	   dimension that is a multiple of PACKMR, with the region between the
	   last row and the next multiple of MR zero-padded accordingly. */ \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	/* We don't bother querying the thrinfo_t node for the 1st loop because
	   we can't parallelize that loop in trsm due to the inter-iteration
	   dependencies that exist. */ \
	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
\
	dim_t jr_start, jr_end; \
	dim_t jr_inc; \
\
	/* Determine the thread range and increment for the 2nd loop.
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time.
	   NOTE: Parallelism in the 1st loop is unattainable due to the
	   inter-iteration dependencies present in trsm. */ \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* For the upper-triangular solve, walk the m dimension from the
		   bottom of C upward, starting with the last micro-panel. */ \
		a1  = a_cast; \
		c11 = c1 + (m_iter-1)*rstep_c; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( ib = 0; ib < m_iter; ++ib ) \
		{ \
			i          = m_iter - 1 - ib; \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, use a
			   special micro-kernel that performs a fused gemm and trsm.
			   If the current panel of A resides above the diagonal, use a
			   a regular gemm micro-kernel. Otherwise, if it is below the
			   diagonal, it was not packed (because it is implicitly zero)
			   and so we do nothing. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict a11; \
				ctype* restrict a12; \
				ctype* restrict b11; \
				ctype* restrict b21; \
				ctype* restrict a2; \
\
				/* Compute various offsets into and lengths of parts of A. */ \
				off_a11 = diagoffa_i; \
				k_a1112 = k - off_a11; \
				k_a11   = MR; \
				k_a12   = k_a1112 - MR; \
				off_a12 = off_a11 + k_a11; \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur  = k_a1112 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				/* Compute the addresses of the triangular block A11 and the
				   panel A12. */ \
				a11 = a1; \
				/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
				a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
\
				/* Compute the addresses of the block B11 and the panel
				   B21. */ \
				b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
				b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1 + ps_a_cur; \
				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel. */ \
					gemmtrsm_ukr \
					( \
					  k_a12, \
					  alpha1_cast, \
					  a12, \
					  a11, \
					  b21, \
					  b11, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel. */ \
					gemmtrsm_ukr \
					( \
					  k_a12, \
					  alpha1_cast, \
					  a12, \
					  a11, \
					  b21, \
					  b11, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the bottom edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        c11, rs_c,  cs_c ); \
				} \
\
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1 + rstep_a; \
				if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  a1, \
					  b1, \
					  alpha2_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        alpha2_cast, \
					                        c11, rs_c,  cs_c ); \
				} \
\
				a1 += rstep_a; \
			} \
\
			c11 -= rstep_c; \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter     = %lu\n", m_iter ); \
printf( "m_cur      = %lu\n", m_cur ); \
printf( "k          = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112  = %lu\n", off_a1112 ); \
printf( "k_a1112    = %lu\n", k_a1112 ); \
printf( "k_a12      = %lu\n", k_a12 ); \
printf( "k_a11      = %lu\n", k_a11 ); \
printf( "rs_c,cs_c  = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )

View File

@@ -428,7 +428,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a11; \
ctype* restrict a12; \
@@ -514,7 +514,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a2; \
\

View File

@@ -421,7 +421,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a10; \
ctype* restrict a11; \
@@ -507,7 +507,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a2; \
\

View File

@@ -58,14 +58,10 @@ GENPROT( trsm_blk_var3 )
GENPROT( trsm_packa )
GENPROT( trsm_packb )
GENPROT( trsm_xx_ker_var2sl )
GENPROT( trsm_xx_ker_var2rr )
GENPROT( trsm_ll_ker_var2sl )
GENPROT( trsm_ll_ker_var2rr )
GENPROT( trsm_lu_ker_var2sl )
GENPROT( trsm_lu_ker_var2rr )
GENPROT( trsm_xx_ker_var2 )
GENPROT( trsm_ll_ker_var2 )
GENPROT( trsm_lu_ker_var2 )
GENPROT( trsm_rl_ker_var2 )
GENPROT( trsm_ru_ker_var2 )
@@ -97,11 +93,8 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2sl )
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2rr )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2sl )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2rr )
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 )

View File

@@ -35,13 +35,13 @@
#include "blis.h"
static trsm_var_oft vars_sl[2][2] =
static trsm_var_oft vars[2][2] =
{
{ bli_trsm_ll_ker_var2sl, bli_trsm_lu_ker_var2sl },
{ bli_trsm_rl_ker_var2 , bli_trsm_ru_ker_var2 }
{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
};
void bli_trsm_xx_ker_var2sl
void bli_trsm_xx_ker_var2
(
obj_t* a,
obj_t* b,
@@ -73,62 +73,7 @@ void bli_trsm_xx_ker_var2sl
}
// Index into the variant array to extract the correct function pointer.
f = vars_sl[side][uplo];
// Call the macrokernel.
f
(
a,
b,
c,
cntx,
rntm,
cntl,
thread
);
}
// -----------------------------------------------------------------------------
static trsm_var_oft vars_rr[2][2] =
{
{ bli_trsm_ll_ker_var2rr, bli_trsm_lu_ker_var2rr },
{ bli_trsm_rl_ker_var2 , bli_trsm_ru_ker_var2 }
};
void bli_trsm_xx_ker_var2rr
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bool_t side;
bool_t uplo;
trsm_var_oft f;
// Set two bools: one based on the implied side parameter (the structure
// of the root object) and one based on the uplo field of the triangular
// matrix's root object (whether that is matrix A or matrix B).
if ( bli_obj_root_is_triangular( a ) )
{
side = 0;
if ( bli_obj_root_is_lower( a ) ) uplo = 0;
else uplo = 1;
}
else // if ( bli_obj_root_is_triangular( b ) )
{
side = 1;
if ( bli_obj_root_is_lower( b ) ) uplo = 0;
else uplo = 1;
}
// Index into the variant array to extract the correct function pointer.
f = vars_rr[side][uplo];
f = vars[side][uplo];
// Call the macrokernel.
f

View File

@@ -804,6 +804,15 @@ static bool_t bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth
( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}
static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}
// packbuf_t-related

View File

@@ -59,32 +59,6 @@ void bli_thread_finalize( void )
{
}
// -----------------------------------------------------------------------------
#if 0
void bli_thread_range_jrir
(
thrinfo_t* thread,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
)
{
//#ifdef BLIS_JRIR_INTERLEAVE
#if 1
// Use interleaved partitioning of jr/ir loops.
*start = bli_thread_work_id( thread );
*inc = bli_thread_n_way( thread );
*end = n;
#else
// Use contiguous slab partitioning for jr/ir loops.
bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
*inc = 1;
#endif
}
#endif
// -----------------------------------------------------------------------------
void bli_thread_range_sub

View File

@@ -57,19 +57,6 @@ void bli_thread_finalize( void );
#endif
// Thread range-related prototypes.
#if 0
void bli_thread_range_jrir
(
thrinfo_t* thread,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
);
#endif
// -----------------------------------------------------------------------------
void bli_thread_range_sub
(
@@ -261,7 +248,6 @@ static void bli_thread_range_jrir_sl
*inc = 1;
}
#if 0
static void bli_thread_range_jrir
(
thrinfo_t* thread,
@@ -273,6 +259,9 @@ static void bli_thread_range_jrir
dim_t* inc
)
{
// Define a general-purpose version of bli_thread_range_jrir() whose
// definition depends on whether slab or round-robin partitioning was
// requested at configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
#else
@@ -280,6 +269,7 @@ static void bli_thread_range_jrir
#endif
}
#if 0
static void bli_thread_range_weighted_jrir
(
thrinfo_t* thread,

View File

@@ -59,24 +59,10 @@ cntl_t* blx_gemmbp_cntl_create
void* packa_fp;
void* packb_fp;
#ifdef BLIS_ENABLE_JRIR_SLAB
macro_kernel_fp = blx_gemm_ker_var2;
// Use the function pointers to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_fp = blx_gemm_ker_var2sl;
packa_fp = bli_packm_blk_var1sl;
packb_fp = bli_packm_blk_var1sl;
#else // BLIS_ENABLE_JRIR_RR
// Use the function pointers to the macrokernels that use round-robin
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_fp = bli_gemm_ker_var2rr;
packa_fp = bli_packm_blk_var1rr;
packb_fp = bli_packm_blk_var1rr;
#endif
packa_fp = bli_packm_blk_var1;
packb_fp = bli_packm_blk_var1;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node

View File

@@ -0,0 +1,375 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "blix.h"
// Function pointer type for the datatype-specific gemm macrokernels
// instantiated below. Operands: pack schemas for A and B, problem
// dimensions m/n/k, scalars alpha and beta, the packed buffers for A and B
// (with their packed-matrix strides: column/row stride, imaginary stride,
// panel dim, and panel stride), and the output matrix C with its strides.
typedef void (*gemm_fp)
     (
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, inc_t is_a,
                  dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, inc_t is_b,
                  dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Function pointer array for datatype-specific functions, indexed by
// datatype value. Entry order follows the framework's datatype enumeration:
// float (0), scomplex (1), double (2), dcomplex (3).
static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
{
	PASTECH2(blx_,s,gemm_ker_var2),
	PASTECH2(blx_,c,gemm_ker_var2),
	PASTECH2(blx_,d,gemm_ker_var2),
	PASTECH2(blx_,z,gemm_ker_var2)
};
// Object-based entry point for the sandbox gemm macrokernel. Extracts the
// operand metadata from the obj_t wrappers, merges the scalars attached to
// A and B into a single alpha, and dispatches to the datatype-specific
// implementation.
void blx_gemm_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C selects which typed macrokernel runs.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Packed-matrix geometry for A: buffer, column stride, imaginary
	// stride, panel dim (MR), and panel stride.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const inc_t  is_a     = bli_obj_imag_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Packed-matrix geometry for B: buffer, row stride, imaginary stride,
	// panel dim (NR), and panel stride.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const inc_t  is_b     = bli_obj_imag_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and fold them into a single
	// scalar (accumulated into scalar_b).
	obj_t scalar_a;
	obj_t scalar_b;
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// The merged scalar serves as alpha; the scalar attached to C is beta.
	void* const buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* const buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Look up the typed implementation and invoke it.
	gemm_fp fp = ftypes[ dt_exec ];

	fp
	(
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha,
	  buf_a, cs_a, is_a,
	         pd_a, ps_a,
	  buf_b, rs_b, is_b,
	         pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef  GENTFUNC

// Type-specific gemm macrokernel template: the jr (n dimension) and ir
// (m dimension) loops around the gemm microkernel, with micropanels
// assigned to threads via bli_thread_range_jrir() (slab or round-robin,
// chosen at configure-time).
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(blx_,ch,varname) \
     ( \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	/*const dim_t     PACKMR     = cs_a;*/ \
	/*const dim_t     PACKNR     = rs_b;*/ \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           i, j; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( is_a, &aux ); \
	bli_auxinfo_set_is_b( is_b, &aux ); \
\
	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
	   loop around the microkernel. Here we query the thrinfo_t node for the
	   1st (ir) loop around the microkernel. */ \
	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
	dim_t ir_nt  = bli_thread_n_way( caucus ); \
	dim_t ir_tid = bli_thread_work_id( caucus ); \
\
	dim_t jr_start, jr_end; \
	dim_t ir_start, ir_end; \
	dim_t jr_inc,   ir_inc; \
\
	/* Determine the thread range and increment for the 2nd and 1st loops.
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time. */ \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = ir_start; i < ir_end; i += ir_inc ) \
		{ \
			ctype* restrict a2; \
\
			a1  = a_cast + i * rstep_a; \
			c11 = c1     + i * rstep_c; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* Compute the addresses of the next panels of A and B, which
			   the microkernel may use for prefetching. On this thread's
			   last ir iteration, wrap around to the first panel of A (and
			   similarly for B on the last jr iteration). */ \
			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
			{ \
				a2 = a_cast; \
				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
					b2 = b_cast; \
			} \
\
			/* Save addresses of next panels of A and B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_next_a( a2, &aux ); \
			bli_auxinfo_set_next_b( b2, &aux ); \
\
			/* Handle interior and edge cases separately. */ \
			if ( m_cur == MR && n_cur == NR ) \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  beta_cast, \
				  c11, rs_c, cs_c, \
				  &aux, \
				  cntx \
				); \
			} \
			else \
			{ \
				/* Invoke the gemm micro-kernel, accumulating into the
				   zeroed temporary buffer ct. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  zero, \
				  ct, rs_ct, cs_ct, \
				  &aux, \
				  cntx \
				); \
\
				/* Scale the bottom edge of C and add the result from above. */ \
				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
				                        ct,  rs_ct, cs_ct, \
				                        beta_cast, \
				                        c11, rs_c,  cs_c ); \
			} \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}

#if 0
GENTFUNC( float,    s, gemm_ker_var2 )
GENTFUNC( double,   d, gemm_ker_var2 )
GENTFUNC( scomplex, c, gemm_ker_var2 )
GENTFUNC( dcomplex, z, gemm_ker_var2 )
#else
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
#endif

View File

@@ -58,8 +58,7 @@ GENPROT( gemm_blk_var3 )
GENPROT( gemm_packa )
GENPROT( gemm_packb )
GENPROT( gemm_ker_var2sl )
GENPROT( gemm_ker_var2rr )
GENPROT( gemm_ker_var2 )
//
// Prototype BLAS-like interfaces with void pointer operands.
@@ -87,6 +86,5 @@ void PASTECH2(blx_,ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )