Added option of slab or rr partitioning in jr/ir.

Details:
- Updated existing macrokernel function names and definitions to
  explicitly use slab assignment of micropanels to threads, then created
  duplicate versions of macrokernels that explicitly use round-robin
  assignment instead of slab. NOTE: As in ac18949, trsm_r macrokernels
  were not substantially updated in this commit because they are
  currently disabled in bli_trsm_front.c.
- Updated existing packing function (in blk_packm_blk_var1.c) to
  explicitly use slab partitioning, and then duplicated for round-robin.
- Updated control tree initialization to use the appropriate macrokernel
  and packm function pointers depending on which method (slab or rr) was
  enabled at configure-time.
- Updated configure script to accept new --thread-part-jrir=[slab|rr]
  option (-r [slab|rr] for short), which allows the user to explicitly
  request either slab or round-robin assignment (partitioning) of
  micropanels to threads.
- Updated sandbox/ref99 according to above changes.
- Minor updates to build/add-copyright.py.
This commit is contained in:
Field G. Van Zee
2018-10-07 20:30:32 -05:00
parent 98e01ea04b
commit c92762ecdc
48 changed files with 7477 additions and 1394 deletions

View File

@@ -187,6 +187,8 @@ def main():
else:
filename = git_words[1]
#my_echo( "-debug---- %s" % filename )
# Start by opening the file. (We can assume it exists since it
# was found by 'git status', so no need to check for existence.)
# Read all lines in the file and then close it.
@@ -203,7 +205,7 @@ def main():
# If the file does not have any copyright notice in it already, we
# assume we don't need to update it.
if not has_cr:
my_echo( "[skipped] %s" % filename )
my_echo( "[nocrline] %s" % filename )
continue
# Check whether the file already has a copyright for the_org. We may
@@ -214,7 +216,7 @@ def main():
mod_file_lines = []
# At this point we know that the file has at least one copyright, and
# has_org_cr encodes whether already has a copyright for the_org.
# has_org_cr encodes whether it already has a copyright for the_org.
# We process the files that we know already have copyrights for the_org
# differently from the files that do not yet have them.
@@ -240,12 +242,15 @@ def main():
repl_line = ' %s, ' % cur_year
line_ny = re.sub( find_line, repl_line, line )
my_echo( "[updated] %s" % filename )
my_echo( "[updated ] %s" % filename )
# Add the updated line to the running list.
mod_file_lines += line_ny
else:
my_echo( "[up2date ] %s" % filename )
# Add the unchanged line to the running list.
mod_file_lines += line
@@ -262,7 +267,7 @@ def main():
# Don't go any further if we're only updating existing copyright
# lines.
if update_only:
my_echo( "[skipped] %s" % filename )
my_echo( "[nocrline] %s" % filename )
continue
num_file_lines = len( file_lines )
@@ -313,7 +318,7 @@ def main():
mod_file_lines += line
mod_file_lines += line_nyno
my_echo( "[added ] %s" % filename )
my_echo( "[added ] %s" % filename )
# endif resnext

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -52,6 +53,14 @@
#define BLIS_ENABLE_PTHREADS
#endif
#if @enable_jrir_slab@
#define BLIS_ENABLE_JRIR_SLAB
#endif
#if @enable_jrir_rr@
#define BLIS_ENABLE_JRIR_RR
#endif
#if @enable_packbuf_pools@
#define BLIS_ENABLE_PACKBUF_POOLS
#endif

57
configure vendored
View File

@@ -163,9 +163,6 @@ print_usage()
echo " incur additional overhead in some (but not all)"
echo " situations."
echo " "
echo " -q, --quiet Suppress informational output. By default, configure"
echo " is verbose. (NOTE: -q is not yet implemented)"
echo " "
echo " -i SIZE, --int-size=SIZE"
echo " "
echo " Set the size (in bits) of internal BLIS integers and"
@@ -212,6 +209,19 @@ print_usage()
echo " detects the presence of libmemkind, libmemkind is used"
echo " by default, and otherwise it is not used by default."
echo " "
# Help text for the jr/ir thread partitioning option. The short form -r is
# listed because the getopts string accepts 'r:' and maps it to the same
# thread_part_jrir variable as the long option.
echo " -r METHOD, --thread-part-jrir=METHOD"
echo " "
echo " Request a method of assigning micropanels to threads in"
echo " the JR and IR loops. Valid options are 'slab' and 'rr'."
echo " Using 'slab' assigns (as much as possible) contiguous"
echo " regions of micropanels to each thread while the latter"
echo " assigns micropanels to threads in a round-robin fashion."
echo " (NOTE: Specifying this option constitutes a *request*,"
echo " which may be ignored in certain situations if the"
echo " implementation has a good reason to do so.) The chosen"
echo " method also applies during the packing of A and B. The"
echo " default method is 'slab'."
echo " "
echo " --force-version=STRING"
echo " "
echo " Force configure to use an arbitrary version string"
@@ -226,6 +236,9 @@ print_usage()
echo " a sanity check to make sure these lists are constituted"
echo " as expected."
echo " "
echo " -q, --quiet Suppress informational output. By default, configure"
echo " is verbose. (NOTE: -q is not yet implemented)"
echo " "
echo " -h, --help Output this information and quit."
echo " "
echo " Environment Variables:"
@@ -1591,6 +1604,9 @@ main()
# The threading flag.
threading_model='no'
# The method of assigning micropanels to threads in the JR and IR loops.
thread_part_jrir='slab'
# Option variables.
quiet_flag=''
show_config_list=''
@@ -1641,7 +1657,7 @@ main()
# -- Command line option/argument parsing ----------------------------------
# Process our command line options.
while getopts ":hp:d:s:t:qci:b:-:" opt; do
while getopts ":hp:d:s:t:r:qci:b:-:" opt; do
case $opt in
-)
case "$OPTARG" in
@@ -1705,6 +1721,9 @@ main()
enable-threading=*)
threading_model=${OPTARG#*=}
;;
thread-part-jrir=*)
thread_part_jrir=${OPTARG#*=}
;;
disable-threading)
threading_model='no'
;;
@@ -1776,6 +1795,9 @@ main()
t)
threading_model=$OPTARG
;;
r)
thread_part_jrir=$OPTARG
;;
i)
int_type_size=$OPTARG
;;
@@ -1791,7 +1813,7 @@ main()
esac
done
shift $(($OPTIND - 1))
# Parse environment variables
while [ $# -gt 0 ]; do
case $1 in
@@ -2351,7 +2373,7 @@ main()
elif [ "x${threading_model}" = "xpthreads" ] ||
[ "x${threading_model}" = "xpthread" ] ||
[ "x${threading_model}" = "xposix" ]; then
echo "${script_name}: using Pthreads for threading."
echo "${script_name}: using POSIX threads for threading."
enable_pthreads='yes'
enable_pthreads_01=1
threading_model="pthreads" # Standardize the value.
@@ -2362,7 +2384,22 @@ main()
echo "${script_name}: *** Unsupported threading model: ${threading_model}."
exit 1
fi
# Check the method of assigning micropanels to threads in the JR and IR
# loops. Both 0/1 flags start cleared and exactly one is set for a valid
# request; any other value is rejected with an error that names the
# offending value.
enable_jrir_slab_01=0
enable_jrir_rr_01=0
if [ "x${thread_part_jrir}" = "xslab" ]; then
echo "${script_name}: requesting slab threading in jr and ir loops."
enable_jrir_slab_01=1
elif [ "x${thread_part_jrir}" = "xrr" ]; then
echo "${script_name}: requesting round-robin threading in jr and ir loops."
enable_jrir_rr_01=1
else
# Report the rejected partitioning method (not the threading model).
echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}."
exit 1
fi
# Convert 'yes' and 'no' flags to booleans.
if [ "x${enable_packbuf_pools}" = "xyes" ]; then
echo "${script_name}: internal memory pools for packing buffers are enabled."
@@ -2414,7 +2451,7 @@ main()
echo "${script_name}: the CBLAS compatibility layer is disabled."
enable_cblas_01=0
fi
# Report integer sizes
if [ "x${int_type_size}" = "x32" ]; then
echo "${script_name}: the internal integer size is 32-bit."
@@ -2590,6 +2627,8 @@ main()
| perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \
| sed -e "s/@enable_openmp@/${enable_openmp_01}/g" \
| sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \
| sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
| sed -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \
| sed -e "s/@enable_packbuf_pools@/${enable_packbuf_pools_01}/g" \
| sed -e "s/@int_type_size@/${int_type_size}/g" \
| sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \
@@ -2681,7 +2720,7 @@ main()
# -- Mirror source directory hierarchies to object directories -------------
# Combine the config_list with the config_name and then remove duplicates.
config_list_plus_name=$(rm_duplicate_words "${config_list} ${config_name}")

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -39,9 +40,7 @@
#include "bli_packm_part.h"
#include "bli_packm_unb_var1.h"
#include "bli_packm_blk_var1.h"
#include "bli_packm_var.h"
#include "bli_packm_struc_cxk.h"
#include "bli_packm_struc_cxk_4mi.h"

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,71 +35,6 @@
#include "blis.h"
// Function pointer type for the datatype-specific packm_blk_var1
// implementations. The parameter list matches the functions generated by
// the GENTFUNCR macro in this file: matrix structure/diagonal/uplo/trans
// properties of the source, the pack schema and packing flags, the
// dimensions (and padded dimensions) of the packed matrix, the kappa scalar,
// the source (c) and destination (p) buffers with their strides, the
// packm kernel to call, the context, and the calling thread's info.
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
// Array of packm_blk_var1 function pointers, indexed by datatype (see the
// ftypes[dt_cp] lookup in bli_packm_blk_var1()).
// NOTE(review): GENARRAY is defined elsewhere; presumably it expands to an
// initialized array with one entry per datatype -- confirm.
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
// Default lookup table of structure-aware packm kernels. There is one func_t
// per pack schema; bli_packm_blk_var1() selects the entry with
// bli_pack_schema_index( schema ). Within each func_t the four slots are
// ordered float, scomplex, double, dcomplex (see the column header below).
// For every schema other than the first, the real-domain slots are NULL and
// only the complex slots name a kernel.
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
void bli_packm_blk_var1
(
obj_t* c,
@@ -108,619 +44,14 @@ void bli_packm_blk_var1
thrinfo_t* t
)
{
num_t dt_cp = bli_obj_dt( c );
#ifdef BLIS_ENABLE_JRIR_SLAB
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
pack_t schema = bli_obj_pack_schema( p );
bool_t invdiag = bli_obj_has_inverted_diag( p );
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
bli_packm_blk_var1sl( c, p, cntx, cntl, t );
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
#else // BLIS_ENABLE_JRIR_RR
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
bli_packm_blk_var1rr( c, p, cntx, cntl, t );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t pd_p = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
obj_t kappa;
obj_t* kappa_p;
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
// Treatment of kappa (ie: packing during scaling) depends on
// whether we are executing an induced method.
if ( bli_is_nat_packed( schema ) )
{
// This branch is for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
}
else // if ( bli_is_ind_packed( schema ) )
{
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
//printf( "applying non-zero imag kappa\n" );
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, kappa_p );
}
// Choose the correct func_t object based on the pack_t schema.
#if 0
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
#else
// The original idea here was to read the packm_ukr from the context
// if it is non-NULL. The problem is, it requires that we be able to
// assume that the packm_ukr field is initialized to NULL, which it
// currently is not.
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
//if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) )
{
// If the packm structure-aware kernel func_t in the context is
// NULL (which is the default value after the context is created),
// we use the default lookup table to determine the right func_t
// for the current schema.
const dim_t i = bli_pack_schema_index( schema );
packm_kers = &packm_struc_cxk_kers[ i ];
}
#if 0
else // cntx's packm func_t overrides
{
// If the packm structure-aware kernel func_t in the context is
// non-NULL (ie: assumed to be valid), we use that instead.
//packm_kers = bli_cntx_packm_ukrs( cntx );
packm_kers = cntx_packm_kers;
}
#endif
#endif
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_get_dt( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
schema,
invdiag,
revifup,
reviflo,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
is_p,
pd_p, ps_p,
packm_ker,
cntx,
t );
}
// Defines the datatype-specific packm_blk_var1 implementations. Each
// generated function walks the source matrix c in steps of pd_p
// (panel_dim_max) along the iteration dimension and, for every micropanel,
// invokes the packm kernel passed in as packm_ker to pack that panel into p.
// Every thread visits every iteration so that p_begin advances identically
// in all threads; packm_thread_my_iter() determines whether the calling
// thread actually packs the current micropanel.
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t num_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (num_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = num_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/ \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes was
somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/ \
\
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/ \
/*
*/ \
\
/*
*/ \
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/ \
\
\
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
\
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
\
p_begin += p_inc; \
\
} \
}
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 )

View File

@@ -1,463 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the datatype-specific packm_blk_var1
// implementations in this file. The arguments describe the source matrix c
// (structure, diagonal offset, diag/uplo/trans properties), the packing
// schema and flags, the (padded) dimensions, the kappa scalar, the c and p
// buffers with their strides, the packm kernel to call, and the calling
// thread's packm thread info.
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
packm_thrinfo_t* thread
);
// Array of packm_blk_var1 function pointers, indexed by datatype (see the
// ftypes[dt_cp] lookup in bli_packm_blk_var1()).
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
// Structure-aware packm kernel func_t; defined in another translation unit.
extern func_t* packm_struc_cxk_kers;
// Object-based front end for packm_blk_var1. Reads the structural
// properties of the source object c and the packing properties, dimensions,
// buffer, and strides of the pack object p, then dispatches to the
// datatype-specific implementation selected from ftypes by the datatype of
// c. The thread info t is passed through unchanged.
void bli_packm_blk_var1( obj_t* c,
obj_t* p,
packm_thrinfo_t* t )
{
num_t dt_cp = bli_obj_dt( c );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
pack_t schema = bli_obj_pack_schema( p );
bool_t invdiag = bli_obj_has_inverted_diag( p );
bool_t revifup = bli_obj_is_pack_rev_if_upper( p );
bool_t reviflo = bli_obj_is_pack_rev_if_lower( p );
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t pd_p = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
void* buf_kappa;
func_t* packm_kers;
void* packm_ker;
FUNCPTR_T f;
// This variant assumes that the micro-kernel will always apply the
// alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
// for kappa so that the underlying packm implementation does not
// scale during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
// Choose the correct func_t object.
packm_kers = packm_struc_cxk_kers;
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_obj_query( dt_cp, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
schema,
invdiag,
revifup,
reviflo,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
is_p,
pd_p, ps_p,
packm_ker,
t );
}
/*
  GENTFUNC template for the datatype-specific implementations of
  packm_blk_var1, instantiated by INSERT_GENTFUNC_BASIC() below.

  Template arguments:
    ctype   - element type used to interpret the kappa, c, and p buffers.
    ch      - datatype character pasted into the function name.
    varname - base name of the generated function.
    kertype - base name of the packm kernel function pointer type.

  The generated function walks the micro-panels of the source matrix c in
  steps of pd_p and packs them one after another into p. For each
  micro-panel it distinguishes three cases: a triangular panel that is
  entirely unstored (skipped, without advancing p), a triangular panel that
  intersects the diagonal (packed with a shortened panel length), and
  everything else (Hermitian/symmetric or general panels, which advance p
  by ps_p). The kernel is only invoked when
  packm_thread_my_iter( it, thread ) is true, but every thread performs the
  same pointer bookkeeping so that p_begin stays consistent across threads.

  Function parameters:
    strucc, diagoffc, diagc, uploc - structure, diagonal offset, diagonal
        type, and uplo of c.
    transc  - conjugation/transposition of c. A requested transposition is
        induced by swapping rs_c/cs_c, negating diagoffc, and toggling
        uploc; the conjugation bit is forwarded to the kernel.
    schema  - pack schema; selects whether iteration runs over n (column
        panels) or over m (row panels).
    invdiag - forwarded to the kernel.
    revifup, reviflo - request reverse iteration when c is triangular and
        upper- or lower-stored, respectively.
    m, n, m_max, n_max - dimensions and padded dimensions of p.
    kappa   - address of the scalar forwarded to the kernel.
    c, rs_c, cs_c - source buffer and its strides.
    p, rs_p, cs_p, is_p - destination buffer, its strides, and imaginary
        stride.
    pd_p, ps_p - panel dimension and panel stride of p.
    packm_ker - packm kernel, cast to PASTECH(ch,kertype).
    thread  - packm thread info used to assign iterations to threads.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, kertype ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       pack_t schema, \
       bool_t invdiag, \
       bool_t revifup, \
       bool_t reviflo, \
       dim_t m, \
       dim_t n, \
       dim_t m_max, \
       dim_t n_max, \
       void* kappa, \
       void* c, inc_t rs_c, inc_t cs_c, \
       void* p, inc_t rs_p, inc_t cs_p, \
       inc_t is_p, \
       dim_t pd_p, inc_t ps_p, \
       void* packm_ker, \
       packm_thrinfo_t* thread \
     ) \
{ \
    PASTECH(ch,kertype) packm_ker_cast = packm_ker; \
\
    ctype* restrict kappa_cast = kappa; \
    ctype* restrict c_cast = c; \
    ctype* restrict p_cast = p; \
    ctype* restrict c_begin; \
    ctype* restrict p_begin; \
\
    dim_t iter_dim; \
    dim_t num_iter; \
    dim_t it, ic, ip; \
    dim_t ic0, ip0; \
    doff_t ic_inc, ip_inc; \
    doff_t diagoffc_i; \
    doff_t diagoffc_inc; \
    dim_t panel_len_full; \
    dim_t panel_len_i; \
    dim_t panel_len_max; \
    dim_t panel_len_max_i; \
    dim_t panel_dim_i; \
    dim_t panel_dim_max; \
    dim_t panel_off_i; \
    inc_t vs_c; \
    inc_t ldc; \
    inc_t ldp, p_inc; \
    dim_t* m_panel_full; \
    dim_t* n_panel_full; \
    dim_t* m_panel_use; \
    dim_t* n_panel_use; \
    dim_t* m_panel_max; \
    dim_t* n_panel_max; \
    conj_t conjc; \
    bool_t row_stored; \
    bool_t col_stored; \
\
    ctype* restrict c_use; \
    ctype* restrict p_use; \
    doff_t diagoffp_i; \
\
\
    /* If C is zeros and part of a triangular matrix, then we don't need
       to pack it. */ \
    if ( bli_is_zeros( uploc ) && \
         bli_is_triangular( strucc ) ) return; \
\
    /* Extract the conjugation bit from the transposition argument. */ \
    conjc = bli_extract_conj( transc ); \
\
    /* If c needs a transposition, induce it so that we can more simply
       express the remaining parameters and code. */ \
    if ( bli_does_trans( transc ) ) \
    { \
        bli_swap_incs( &rs_c, &cs_c ); \
        bli_negate_diag_offset( &diagoffc ); \
        bli_toggle_uplo( &uploc ); \
        bli_toggle_trans( &transc ); \
    } \
\
    /* Create flags to indicate row or column storage. Note that the
       schema bit that encodes row or column is describing the form of
       micro-panel, not the storage in the micro-panel. Hence the
       mismatch in "row" and "column" semantics. */ \
    row_stored = bli_is_col_packed( schema ); \
    col_stored = bli_is_row_packed( schema ); \
\
    /* If the row storage flag indicates row storage, then we are packing
       to column panels; otherwise, if the strides indicate column storage,
       we are packing to row panels. The m_panel_* and n_panel_* pointers
       alias whichever of the panel length/dimension variables plays the
       role of "m" and "n" for the chosen orientation, so the loop below
       can update the variables and pass the right values to the kernel. */ \
    if ( row_stored ) \
    { \
        /* Prepare to pack to row-stored column panels. */ \
        iter_dim = n; \
        panel_len_full = m; \
        panel_len_max = m_max; \
        panel_dim_max = pd_p; \
        ldc = rs_c; \
        vs_c = cs_c; \
        diagoffc_inc = -( doff_t )panel_dim_max; \
        ldp = rs_p; \
        m_panel_full = &m; \
        n_panel_full = &panel_dim_i; \
        m_panel_use = &panel_len_i; \
        n_panel_use = &panel_dim_i; \
        m_panel_max = &panel_len_max_i; \
        n_panel_max = &panel_dim_max; \
    } \
    else /* if ( col_stored ) */ \
    { \
        /* Prepare to pack to column-stored row panels. */ \
        iter_dim = m; \
        panel_len_full = n; \
        panel_len_max = n_max; \
        panel_dim_max = pd_p; \
        ldc = cs_c; \
        vs_c = rs_c; \
        diagoffc_inc = ( doff_t )panel_dim_max; \
        ldp = cs_p; \
        m_panel_full = &panel_dim_i; \
        n_panel_full = &n; \
        m_panel_use = &panel_dim_i; \
        n_panel_use = &panel_len_i; \
        m_panel_max = &panel_dim_max; \
        n_panel_max = &panel_len_max_i; \
    } \
\
    /* Compute the total number of iterations we'll need. */ \
    num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
    /* Set the initial values and increments for indices related to C and P
       based on whether reverse iteration was requested. */ \
    if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
         ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
    { \
        ic0 = (num_iter - 1) * panel_dim_max; \
        ic_inc = -panel_dim_max; \
        ip0 = num_iter - 1; \
        ip_inc = -1; \
    } \
    else \
    { \
        ic0 = 0; \
        ic_inc = panel_dim_max; \
        ip0 = 0; \
        ip_inc = 1; \
    } \
\
    p_begin = p_cast; \
\
    for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
          ic += ic_inc, ip += ip_inc, it += 1 ) \
    { \
        panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
        diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
        c_begin = c_cast + (ic )*vs_c; \
\
        if ( bli_is_triangular( strucc ) && \
             bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
        { \
            /* This case executes if the panel belongs to a triangular
               matrix AND is completely unstored (ie: zero). If the panel
               is unstored, we do nothing. (Notice that we don't even
               increment p_begin.) */ \
\
            continue; \
        } \
        else if ( bli_is_triangular( strucc ) && \
                  bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
        { \
            /* This case executes if the panel belongs to a triangular
               matrix AND is diagonal-intersecting. Notice that we
               cannot bury the following conditional logic into
               packm_struc_cxk() because we need to know the value of
               panel_len_max_i so we can properly increment p_inc. */ \
\
            /* Sanity check. Diagonals should not intersect the short end of
               a micro-panel. If they do, then somehow the constraints on
               cache blocksizes being a whole multiple of the register
               blocksizes was somehow violated. */ \
            if ( ( col_stored && diagoffc_i < 0 ) || \
                 ( row_stored && diagoffc_i > 0 ) ) \
                bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
            if ( ( row_stored && bli_is_upper( uploc ) ) || \
                 ( col_stored && bli_is_lower( uploc ) ) ) \
            { \
                panel_off_i = 0; \
                panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
                panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
                                           panel_len_max ); \
                diagoffp_i = diagoffc_i; \
            } \
            else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
                         ( col_stored && bli_is_upper( uploc ) ) ) */ \
            { \
                panel_off_i = bli_abs( diagoffc_i ); \
                panel_len_i = panel_len_full - panel_off_i; \
                panel_len_max_i = panel_len_max - panel_off_i; \
                diagoffp_i = 0; \
            } \
\
            c_use = c_begin + (panel_off_i )*ldc; \
            p_use = p_begin; \
\
            /* Only the thread assigned to this iteration packs the panel;
               the increment of p_begin below is computed by every thread. */ \
            if( packm_thread_my_iter( it, thread ) ) \
            { \
                packm_ker_cast( strucc, \
                                diagoffp_i, \
                                diagc, \
                                uploc, \
                                conjc, \
                                schema, \
                                invdiag, \
                                *m_panel_use, \
                                *n_panel_use, \
                                *m_panel_max, \
                                *n_panel_max, \
                                kappa_cast, \
                                c_use, rs_c, cs_c, \
                                p_use, rs_p, cs_p, \
                                is_p ); \
            } \
\
            /* NOTE: This value is usually LESS than ps_p because triangular
               matrices usually have several micro-panels that are shorter
               than a "full" micro-panel. */ \
            p_inc = ldp * panel_len_max_i; \
\
            /* We nudge the panel increment up by one if it is odd. */ \
            p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \
        } \
        else if ( bli_is_herm_or_symm( strucc ) ) \
        { \
            /* This case executes if the panel belongs to a Hermitian or
               symmetric matrix, which includes stored, unstored, and
               diagonal-intersecting panels. */ \
\
            panel_len_i = panel_len_full; \
            panel_len_max_i = panel_len_max; \
\
            if( packm_thread_my_iter( it, thread ) ) \
            { \
                packm_ker_cast( strucc, \
                                diagoffc_i, \
                                diagc, \
                                uploc, \
                                conjc, \
                                schema, \
                                invdiag, \
                                *m_panel_use, \
                                *n_panel_use, \
                                *m_panel_max, \
                                *n_panel_max, \
                                kappa_cast, \
                                c_begin, rs_c, cs_c, \
                                p_begin, rs_p, cs_p, \
                                is_p ); \
            } \
\
            /* NOTE: This value is equivalent to ps_p (that is, to
               ldp * panel_len_max_i for a full-length panel). */ \
            p_inc = ps_p; \
        } \
        else \
        { \
            /* This case executes if the panel is general, or, if the
               panel is part of a triangular matrix and is neither unstored
               (ie: zero) nor diagonal-intersecting. The panel is therefore
               packed as a dense, general panel. */ \
\
            panel_len_i = panel_len_full; \
            panel_len_max_i = panel_len_max; \
\
            if( packm_thread_my_iter( it, thread ) ) \
            { \
                packm_ker_cast( BLIS_GENERAL, \
                                0, \
                                diagc, \
                                BLIS_DENSE, \
                                conjc, \
                                schema, \
                                invdiag, \
                                *m_panel_use, \
                                *n_panel_use, \
                                *m_panel_max, \
                                *n_panel_max, \
                                kappa_cast, \
                                c_begin, rs_c, cs_c, \
                                p_begin, rs_p, cs_p, \
                                is_p ); \
            } \
\
            /* NOTE: This value is equivalent to ps_p (that is, to
               ldp * panel_len_max_i for a full-length panel). */ \
            p_inc = ps_p; \
        } \
\
\
        p_begin += p_inc; \
    } \
}
INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t )

View File

@@ -0,0 +1,737 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Name of the function pointer type used to dispatch to the
// datatype-specific implementations of packm_blk_var1rr.
#define FUNCPTR_T packm_fp
// Signature of a datatype-specific packm_blk_var1rr implementation. The
// arguments mirror the values that bli_packm_blk_var1rr() extracts from the
// source object (c) and the packed destination object (p):
// - strucc, diagoffc, diagc, uploc, transc: structure, diagonal offset,
//   diagonal type, uplo, and conjugation/transposition status of c.
// - schema, invdiag, revifup, reviflo: pack schema and packing flags of p.
// - m, n, m_max, n_max: length/width and padded length/width of p.
// - kappa: address of the scalar passed to the packm kernel.
// - c, rs_c, cs_c: buffer and row/column strides of c.
// - p, rs_p, cs_p, is_p: buffer, row/column strides, and imaginary
//   stride of p.
// - pd_p, ps_p: panel dimension and panel stride of p.
// - packm_ker: untyped pointer to the packm kernel to invoke.
// - cntx: context, forwarded unchanged to the packm kernel.
// - thread: thrinfo_t node of the calling thread.
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
// Table of packm_blk_var1rr implementations; bli_packm_blk_var1rr() indexes
// it with the datatype of c. (GENARRAY is defined elsewhere; presumably it
// fills the array with the per-datatype instances of packm_blk_var1rr.)
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1rr);
// Default lookup table of structure-aware packm kernels, holding one func_t
// per pack schema. bli_packm_blk_var1rr() selects the row with
// bli_pack_schema_index( schema ) and then queries that func_t for the
// datatype of the matrix being packed. Each row comment gives the schema
// index in binary. Only the first row (0000) provides kernels for all four
// datatypes; every other row has NULL in the float and double slots and
// provides kernels only for scomplex and dcomplex.
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
// (shares the _3mis kernels with the 3m separated schema below)
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
// (the three rih schemas 0101-0111 all use the same _rih kernels)
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
// (1e and 1r share the same _1er kernels)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
/*
  bli_packm_blk_var1rr()

  Object-based front end for the round-robin flavor of blocked packm
  variant 1. It reads the properties of the source object c and of the
  packed destination object p, chooses the scalar (kappa) that the packm
  kernels should apply, looks up the structure-aware packm kernel for the
  pack schema and datatype, and forwards everything to the typed
  implementation registered in ftypes for that datatype.

  c    - source matrix object.
  p    - destination (packed) matrix object. When an induced-method schema
         is in use and the scalar attached to p has a nonzero imaginary
         part, that scalar is detached from p and p's scalar is reset.
  cntx - context; passed through to the typed implementation.
  cntl - control tree node; not referenced by this function.
  t    - thrinfo_t node of the calling thread.
*/
void bli_packm_blk_var1rr
     (
       obj_t*     c,
       obj_t*     p,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* t
     )
{
    // Datatype (queried from c) used for the kernel lookup and dispatch.
    const num_t   dt        = bli_obj_dt( c );

    // Structural properties of the source matrix.
    const struc_t struc     = bli_obj_struc( c );
    const doff_t  diagoff   = bli_obj_diag_offset( c );
    const diag_t  diag      = bli_obj_diag( c );
    const uplo_t  uplo      = bli_obj_uplo( c );
    const trans_t trans     = bli_obj_conjtrans_status( c );

    // Packing attributes recorded on the destination object.
    const pack_t  schema    = bli_obj_pack_schema( p );
    const bool_t  inv_diag  = bli_obj_has_inverted_diag( p );
    const bool_t  rev_upper = bli_obj_is_pack_rev_if_upper( p );
    const bool_t  rev_lower = bli_obj_is_pack_rev_if_lower( p );

    // Dimensions of p, both actual and padded.
    const dim_t   m         = bli_obj_length( p );
    const dim_t   n         = bli_obj_width( p );
    const dim_t   m_padded  = bli_obj_padded_length( p );
    const dim_t   n_padded  = bli_obj_padded_width( p );

    // Buffer and strides of the source matrix.
    void*         c_buf     = bli_obj_buffer_at_off( c );
    const inc_t   c_rs      = bli_obj_row_stride( c );
    const inc_t   c_cs      = bli_obj_col_stride( c );

    // Buffer, strides, and panel geometry of the packed matrix.
    void*         p_buf     = bli_obj_buffer_at_off( p );
    const inc_t   p_rs      = bli_obj_row_stride( p );
    const inc_t   p_cs      = bli_obj_col_stride( p );
    const inc_t   p_is      = bli_obj_imag_stride( p );
    const dim_t   p_pd      = bli_obj_panel_dim( p );
    const inc_t   p_ps      = bli_obj_panel_stride( p );

    // Storage for a scalar detached from p. It must outlive the call to
    // the typed implementation because kappa_buf may point into it.
    obj_t         detached_kappa;
    void*         kappa_buf;

    // How kappa is chosen (ie: whether we scale while packing) depends on
    // whether an induced method is being executed.
    if ( bli_is_nat_packed( schema ) )
    {
        // Native execution: the micro-kernel is assumed to always apply
        // the alpha scalar of the higher-level operation, so packing uses
        // BLIS_ONE and performs no scaling.
        kappa_buf = bli_obj_buffer_for_const( dt, &BLIS_ONE );
    }
    else // if ( bli_is_ind_packed( schema ) )
    {
        // Induced execution: by default the scalar is applied later (in
        // the micro-kernel), so packing again uses BLIS_ONE.
        obj_t* kappa_src = &BLIS_ONE;

        // If, however, the attached scalar has a nonzero imaginary
        // component, apply it during packing instead. Induced complex
        // algorithms are implemented in terms of real domain micro-kernels,
        // where applying a real scalar is easy but a complex one is not.
        if ( bli_obj_scalar_has_nonzero_imag( p ) )
        {
            // Detach the scalar, then reset the attached scalar (to 1.0).
            bli_obj_scalar_detach( p, &detached_kappa );
            bli_obj_scalar_reset( p );

            kappa_src = &detached_kappa;
        }

        // Acquire the buffer to the kappa chosen above.
        kappa_buf = bli_obj_buffer_for_1x1( dt, kappa_src );
    }

    // Choose the func_t of packm kernels from the default lookup table,
    // using the index of the current schema. (The original idea was to let
    // a non-NULL packm_ukr in the context override this table, but that
    // would require the context's packm_ukr field to be initialized to
    // NULL, which it currently is not.)
    func_t* schema_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ];

    // Query the datatype-specific function pointer from the func_t object.
    void*   ker         = bli_func_get_dt( dt, schema_kers );

    // Dispatch to the typed implementation registered for this datatype.
    ftypes[dt]( struc,
                diagoff,
                diag,
                uplo,
                trans,
                schema,
                inv_diag,
                rev_upper,
                rev_lower,
                m,
                n,
                m_padded,
                n_padded,
                kappa_buf,
                c_buf, c_rs, c_cs,
                p_buf, p_rs, p_cs,
                p_is,
                p_pd, p_ps,
                ker,
                cntx,
                t );
}
/*
  GENTFUNCR template for the datatype-specific implementations of
  packm_blk_var1rr, instantiated by INSERT_GENTFUNCR_BASIC() below.

  Template arguments:
    ctype   - element type used to interpret the kappa, c, and p buffers.
    ctype_r, chr - not referenced by the body of this template.
    ch      - datatype character pasted into the function name and into
              the packm kernel function pointer type.
    opname  - operation name pasted into the kernel function pointer type.
    varname - base name of the generated function.

  The generated function walks every logical micro-panel of the source
  matrix c in steps of pd_p and packs the panels one after another into p.
  For each micro-panel it distinguishes three cases: a triangular panel
  that is entirely unstored (skipped, without advancing p), a triangular
  panel that intersects the diagonal (packed with a shortened panel length
  and a recomputed imaginary stride), and everything else
  (Hermitian/symmetric or general panels, which advance p by ps_p).

  Every thread iterates over all micro-panels and performs the same pointer
  bookkeeping, but it only invokes the packm kernel for the iterations
  selected by bli_packm_my_iter_rr(), using the range and thread ids
  obtained from bli_thread_range_jrir_rr(), bli_thread_n_way(), and
  bli_thread_work_id() (all defined elsewhere).

  Function parameters:
    strucc, diagoffc, diagc, uploc - structure, diagonal offset, diagonal
        type, and uplo of c.
    transc  - conjugation/transposition of c. A requested transposition is
        induced by swapping rs_c/cs_c, negating diagoffc, and toggling
        uploc; the conjugation bit is forwarded to the kernel.
    schema  - pack schema; selects the panel orientation and the storage
        stride scaling (ss_num/ss_den).
    invdiag - forwarded to the kernel.
    revifup, reviflo - request reverse iteration when c is triangular and
        upper- or lower-stored, respectively.
    m, n, m_max, n_max - dimensions and padded dimensions of p.
    kappa   - address of the scalar forwarded to the kernel.
    c, rs_c, cs_c - source buffer and its strides.
    p, rs_p, cs_p, is_p - destination buffer, its strides, and imaginary
        stride.
    pd_p, ps_p - panel dimension and panel stride of p.
    packm_ker - packm kernel, cast to PASTECH2(ch,opname,_ker_ft).
    cntx    - context, forwarded to the kernel.
    thread  - thrinfo_t node of the calling thread.
*/
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. The m_panel_* and n_panel_* pointers
alias whichever of the panel length/dimension variables plays the
role of "m" and "n" for the chosen orientation, so the loop below
can update the variables and pass the right values to the kernel. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (n_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = n_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: it_inc is set here but is not referenced
again in this function; the loop below always advances it by 1. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
/* Only the thread selected for this iteration packs the panel; the
increment of p_begin below is computed by every thread so that
all threads agree on where each panel lives in p. */ \
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. The panel is therefore
packed as a dense, general panel. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
/*
printf( "thread %d: packing micropanel iteration %3d\n", (int)tid, (int)it ); \
*/ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
p_begin += p_inc; \
\
} \
/*
printf( "thread %d: done\n", (int)tid ); \
*/ \
}
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1rr )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -0,0 +1,737 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by the type-specific packm_blk_var1sl functions that the
// GENTFUNCR macro further down in this file instantiates. The object-based
// bli_packm_blk_var1sl() unpacks its obj_t operands into these raw values and
// then calls through a pointer of this type.
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool_t invdiag,
bool_t revifup,
bool_t reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
// Table of typed implementations, indexed by datatype in
// bli_packm_blk_var1sl() (ftypes[dt_cp]).
// NOTE(review): GENARRAY presumably expands to an initializer listing the
// s/c/d/z instances of packm_blk_var1sl -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1sl);
// Default lookup table of structure-aware packm kernels. The outer index is
// the value returned by bli_pack_schema_index( schema ); the datatype-specific
// pointer is then pulled out of the selected func_t with bli_func_get_dt().
// Real-domain slots are NULL for every schema other than the first (plain
// row/col panels), and the "4m separated" row is entirely NULL because it is
// not implemented.
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
//
// Object-based entry point for the slab-partitioned blocked packm variant.
//
// Unpacks the source object c and the destination (packed) object p into raw
// buffers, strides and flags, decides which scalar (kappa) the packing kernel
// should apply, looks up the structure-aware packm kernel for the pack schema
// and datatype, and finally dispatches to the datatype-specific
// packm_blk_var1sl implementation.
//
//   c     - source matrix object.
//   p     - destination (packed) matrix object; its attached scalar is reset
//           when that scalar is consumed here as kappa.
//   cntx  - context, forwarded to the typed implementation.
//   cntl  - control tree node (not referenced by this function).
//   t     - thrinfo_t node for the calling thread, forwarded as-is.
//
void bli_packm_blk_var1sl
     (
       obj_t*     c,
       obj_t*     p,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* t
     )
{
	// -- Attributes of the source matrix --------------------------------------

	num_t   dt       = bli_obj_dt( c );
	struc_t struc_c  = bli_obj_struc( c );
	doff_t  doff_c   = bli_obj_diag_offset( c );
	diag_t  diag_c   = bli_obj_diag( c );
	uplo_t  uplo_c   = bli_obj_uplo( c );
	trans_t trans_c  = bli_obj_conjtrans_status( c );
	void*   c_buf    = bli_obj_buffer_at_off( c );
	inc_t   c_rs     = bli_obj_row_stride( c );
	inc_t   c_cs     = bli_obj_col_stride( c );

	// -- Attributes of the packed matrix --------------------------------------

	pack_t  schema   = bli_obj_pack_schema( p );
	bool_t  inv_diag = bli_obj_has_inverted_diag( p );
	bool_t  rev_up   = bli_obj_is_pack_rev_if_upper( p );
	bool_t  rev_lo   = bli_obj_is_pack_rev_if_lower( p );
	dim_t   p_m      = bli_obj_length( p );
	dim_t   p_n      = bli_obj_width( p );
	dim_t   p_m_max  = bli_obj_padded_length( p );
	dim_t   p_n_max  = bli_obj_padded_width( p );
	void*   p_buf    = bli_obj_buffer_at_off( p );
	inc_t   p_rs     = bli_obj_row_stride( p );
	inc_t   p_cs     = bli_obj_col_stride( p );
	inc_t   p_is     = bli_obj_imag_stride( p );
	dim_t   p_pd     = bli_obj_panel_dim( p );
	inc_t   p_ps     = bli_obj_panel_stride( p );

	// -- Choose kappa ---------------------------------------------------------

	// Storage for a scalar detached from p. It lives at function scope
	// because kappa_buf may be derived from it and is used by the call at
	// the bottom of this function.
	obj_t   detached_kappa;
	void*   kappa_buf;

	if ( bli_is_nat_packed( schema ) )
	{
		// Native execution: the micro-kernel is assumed to apply the alpha
		// scalar of the higher-level operation, so packing must not scale.
		// Hence kappa is BLIS_ONE.
		kappa_buf = bli_obj_buffer_for_const( dt, &BLIS_ONE );
	}
	else // if ( bli_is_ind_packed( schema ) )
	{
		// Induced-method execution: a scalar with a nonzero imaginary part
		// is awkward to apply from real-domain micro-kernels, so in that
		// case it is applied here, during packing. A purely real scalar is
		// left attached and applied later by the micro-kernel, which means
		// packing once again uses BLIS_ONE.
		obj_t* kappa_src = &BLIS_ONE;

		if ( bli_obj_scalar_has_nonzero_imag( p ) )
		{
			//printf( "applying non-zero imag kappa\n" );

			// Move the scalar out of p and leave p with its reset scalar.
			bli_obj_scalar_detach( p, &detached_kappa );
			bli_obj_scalar_reset( p );

			kappa_src = &detached_kappa;
		}

		kappa_buf = bli_obj_buffer_for_1x1( dt, kappa_src );
	}

	// -- Choose the packm kernel ----------------------------------------------

	// The default lookup table is always used. Reading an override from the
	// context (bli_cntx_get_packm_ukr()) was the original idea, but that
	// would require the context's packm_ukr field to be reliably initialized
	// to NULL, which it currently is not.
	const dim_t schema_idx = bli_pack_schema_index( schema );
	func_t*     kers       = &packm_struc_cxk_kers[ schema_idx ];
	void*       ker        = bli_func_get_dt( dt, kers );

	// -- Dispatch to the datatype-specific implementation ---------------------

	ftypes[dt]
	(
	  struc_c,
	  doff_c,
	  diag_c,
	  uplo_c,
	  trans_c,
	  schema,
	  inv_diag,
	  rev_up,
	  rev_lo,
	  p_m,
	  p_n,
	  p_m_max,
	  p_n_max,
	  kappa_buf,
	  c_buf, c_rs, c_cs,
	  p_buf, p_rs, p_cs,
	  p_is,
	  p_pd, p_ps,
	  ker,
	  cntx,
	  t
	);
}
//
// Typed implementation of packm_blk_var1sl, instantiated for each datatype by
// INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1sl ) below.
//
// Overview of what the body does:
// - Returns immediately if the source is a triangular matrix whose uplo is
//   "zeros".
// - Induces any requested transposition up front by swapping the strides of
//   c, negating the diagonal offset and toggling uplo/trans.
// - Chooses, from the pack schema, whether micropanels are taken along n
//   (column panels) or along m (row panels), and aliases the m/n panel
//   dimensions through pointers so the kernel call sites are written once.
// - Walks n_iter = ceil( iter_dim / panel_dim_max ) micropanels, optionally
//   in reverse for triangular matrices when revifup/reviflo applies.
// - For each micropanel one of four things happens: it is skipped (unstored
//   part of a triangular matrix), packed as a diagonal-intersecting
//   triangular panel, packed as a Hermitian/symmetric panel, or packed as a
//   general (dense) panel.
// - p_begin is advanced by p_inc after every packed-case iteration on every
//   thread; the packm kernel itself is only called on iterations for which
//   the thread's ownership predicate is true.
//
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool_t row_stored; \
bool_t col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (n_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = n_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. Every
thread visits all n_iter iterations so that p_begin is advanced the
same way on each of them; the packm kernel is only invoked on the
iterations for which the ownership test below is true. */ \
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes was
somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
/* NOTE(review): this is the only ownership test in this slab variant
that uses the round-robin predicate (_rr) instead of the slab
predicate (_sl) used by the Hermitian/symmetric and general branches
below; it_start/it_end are passed but presumably not consulted by
_rr. Confirm whether round-robin is intentional for
diagonal-intersecting triangular panels. */ \
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
if ( bli_packm_my_iter_sl( it, it_start, it_end, tid, nt ) ) \
{ \
/*
printf( "thread %d: packing micropanel iteration %3d\n", (int)tid, (int)it ); \
*/ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
p_begin += p_inc; \
\
} \
/*
printf( "thread %d: done\n", (int)tid ); \
*/ \
}
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1sl )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -36,7 +37,22 @@
// thrinfo_t macros specific to packm.
//
#define packm_thread_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
/*
#define bli_packm_thread_my_iter( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )
*/
// bli_packm_my_iter_rr(): round-robin ownership test. Evaluates to nonzero
// when iteration i belongs to the thread with the given work_id out of n_way
// threads, i.e. when i and work_id are congruent modulo n_way. The start and
// end arguments are accepted only so that both predicates share one call
// signature; they are not evaluated.
//
// Every argument is parenthesized in the expansion so that callers may pass
// arbitrary expressions (e.g. "it + 1" or "a | b") without operator
// precedence changing the meaning of the test.
#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \
\
	( ( i ) % ( n_way ) == ( work_id ) % ( n_way ) )

// bli_packm_my_iter_sl(): slab ownership test. Evaluates to nonzero when
// iteration i lies in the half-open range [start, end). The work_id and
// n_way arguments are accepted only for signature parity with the
// round-robin predicate; they are not evaluated.
#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \
\
	( ( start ) <= ( i ) && ( i ) < ( end ) )
//
// thrinfo_t APIs specific to packm.

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -32,15 +33,52 @@
*/
void bli_packm_blk_var1
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* t
//
// Prototype object-based interfaces.
//
// Declares one object-based packm variant per GENPROT() use below. Each
// generated prototype takes the source object c, the packed object p, a
// context, a control tree node and a thrinfo_t node, which is the same
// parameter list used by the definition of bli_packm_blk_var1sl().
// NOTE(review): PASTEMAC0 presumably prepends the "bli_" prefix to opname --
// confirm against its definition.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* c, \
obj_t* p, \
cntx_t* cntx, \
cntl_t* cntl, \
thrinfo_t* t \
);
GENPROT( packm_unb_var1 )
GENPROT( packm_blk_var1 )
GENPROT( packm_blk_var1sl )
GENPROT( packm_blk_var1rr )
//
// Prototype BLAS-like interfaces with void pointer operands.
//
// Declares the type-specific (void-pointer operand) prototypes for
// packm_unb_var1. Unlike the blocked variants, this signature carries no pack
// schema, panel dimension/stride, kernel pointer or thrinfo_t argument.
// NOTE(review): INSERT_GENTPROT_BASIC0 presumably instantiates GENTPROT once
// per supported datatype -- confirm against its definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( packm_unb_var1 )
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
@@ -70,5 +108,6 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( packm_blk_var1 )
INSERT_GENTPROT_BASIC0( packm_blk_var1sl )
INSERT_GENTPROT_BASIC0( packm_blk_var1rr )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -53,11 +54,34 @@ cntl_t* bli_gemmbp_cntl_create
pack_t schema_b
)
{
void* macro_kernel_p = bli_gemm_ker_var2;
void* macro_kernel_fp;
void* packa_fp;
void* packb_fp;
// Change the macro-kernel if the operation family is herk or trmm.
if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
#ifdef BLIS_ENABLE_JRIR_SLAB
// Use the function pointers to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2sl;
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2sl;
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2sl;
else macro_kernel_fp = NULL;
packa_fp = bli_packm_blk_var1sl;
packb_fp = bli_packm_blk_var1sl;
#else // BLIS_ENABLE_JRIR_RR
// Use the function pointers to the macrokernels that use round-robin
// assignment of micropanels to threads in the jr and ir loops.
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2rr;
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2rr;
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2rr;
else macro_kernel_fp = NULL;
packa_fp = bli_packm_blk_var1rr;
packb_fp = bli_packm_blk_var1rr;
#endif
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
@@ -72,7 +96,7 @@ cntl_t* bli_gemmbp_cntl_create
(
family,
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
macro_kernel_fp,
gemm_cntl_bu_ke
);
@@ -80,7 +104,7 @@ cntl_t* bli_gemmbp_cntl_create
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
(
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
packa_fp,
BLIS_MR,
BLIS_KR,
FALSE, // do NOT invert diagonal
@@ -104,7 +128,7 @@ cntl_t* bli_gemmbp_cntl_create
cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
(
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
packb_fp,
BLIS_KR,
BLIS_NR,
FALSE, // do NOT invert diagonal

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -114,7 +115,9 @@ void bli_gemm_int
if ( im != BLIS_NAT )
{
if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
if ( im == BLIS_4M1B )
if ( f == bli_gemm_ker_var2sl ||
f == bli_gemm_ker_var2rr ) f = bli_gemm4mb_ker_var2;
}
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -32,6 +33,8 @@
*/
#if 0
#include "blis.h"
void bli_gemm_ker_var1
@@ -55,3 +58,5 @@ void bli_gemm_ker_var1
bli_gemm_ker_var2( b, a, c, cntx, rntm, cntl, thread );
}
#endif

View File

@@ -56,10 +56,13 @@ typedef void (*FUNCPTR_T)
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
void bli_gemm_ker_var2
void bli_gemm_ker_var2rr
(
obj_t* a,
obj_t* b,
@@ -117,7 +120,6 @@ void bli_gemm_ker_var2
// real-valued beta, we can use the real domain macro-kernel, which
// eliminates a little overhead associated with the 1m virtual
// micro-kernel.
#if 1
if ( bli_is_1m_packed( schema_a ) )
{
bli_l3_ind_recast_1m_params
@@ -131,7 +133,6 @@ void bli_gemm_ker_var2
rs_c, cs_c
);
}
#endif
// Index into the type combination array to extract the correct
// function pointer.
@@ -285,8 +286,8 @@ void PASTEMAC(ch,varname) \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
@@ -315,11 +316,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -369,11 +370,11 @@ void PASTEMAC(ch,varname) \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr )

View File

@@ -0,0 +1,380 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature of the type-specific gemm_ker_var2sl macrokernels declared by the
// GENTFUNC macro that follows in this file. The object-based
// bli_gemm_ker_var2sl() unpacks its obj_t operands into these raw values and
// calls through a pointer of this type.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed macrokernels, indexed by the execution datatype in
// bli_gemm_ker_var2sl() (ftypes[dt_exec]).
// NOTE(review): GENARRAY presumably expands to an initializer listing the
// s/c/d/z instances of gemm_ker_var2sl -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
//
// Object-based entry point for the slab-partitioned gemm macrokernel.
//
// Unpacks the packed operands a and b and the output c into raw buffers,
// strides and panel parameters, merges the scalars attached to a and b into
// a single alpha, takes beta from c, optionally recasts the problem for 1m
// execution, and dispatches to the datatype-specific gemm_ker_var2sl.
//
//   a, b   - packed left- and right-hand operands.
//   c      - output matrix; its internal scalar serves as beta.
//   cntx   - context, forwarded to the typed macrokernel.
//   rntm   - runtime object, forwarded to the typed macrokernel.
//   cntl   - control tree node (not referenced by this function).
//   thread - thrinfo_t node for the calling thread, forwarded as-is.
//
void bli_gemm_ker_var2sl
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// NOTE: dt_exec, the problem dimensions, the panel parameters and the
	// strides of c are deliberately left non-const because they are handed
	// by name to bli_l3_ind_recast_1m_params() below.

	// -- Execution datatype and problem dimensions ----------------------------

	num_t  dt_exec  = bli_obj_exec_dt( c );

	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	// -- Packed operand A -----------------------------------------------------

	pack_t schema_a = bli_obj_pack_schema( a );
	void*  a_buf    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	inc_t  is_a     = bli_obj_imag_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	// -- Packed operand B -----------------------------------------------------

	pack_t schema_b = bli_obj_pack_schema( b );
	void*  b_buf    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	inc_t  is_b     = bli_obj_imag_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	// -- Output matrix C ------------------------------------------------------

	void*  c_buf    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	// -- Scalars --------------------------------------------------------------

	// Pull the scalars off of A and B and fold them together with
	// bli_mulsc(); alpha is then read from the second operand of that call.
	obj_t  alpha_from_a;
	obj_t  alpha_merged;

	bli_obj_scalar_detach( a, &alpha_from_a );
	bli_obj_scalar_detach( b, &alpha_merged );
	bli_mulsc( &alpha_from_a, &alpha_merged );

	// alpha is the merged scalar; beta is whatever scalar is attached to C.
	void*  alpha_buf = bli_obj_internal_scalar_buffer( &alpha_merged );
	void*  beta_buf  = bli_obj_internal_scalar_buffer( c );

	// -- 1m recast ------------------------------------------------------------

	// When 1m is in use on a column- or row-stored C with a real-valued
	// beta, the real-domain macrokernel can be used instead, which avoids
	// some of the overhead of the 1m virtual micro-kernel.
	if ( bli_is_1m_packed( schema_a ) )
	{
		bli_l3_ind_recast_1m_params
		(
		  dt_exec,
		  schema_a,
		  c,
		  m, n, k,
		  pd_a, ps_a,
		  pd_b, ps_b,
		  rs_c, cs_c
		);
	}

	// -- Dispatch to the datatype-specific macrokernel ------------------------

	ftypes[dt_exec]
	(
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, cs_a, is_a,
	  pd_a, ps_a,
	  b_buf, rs_b, is_b,
	  pd_b, ps_b,
	  beta_buf,
	  c_buf, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
//
// Type-generic body of the slab-partitioned gemm macrokernel; it is
// instantiated by INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl ) below.
//
// Each instance steps through the micropanels of A (ps_a apart) and of B
// (ps_b apart) and invokes the gemm microkernel on the MR x NR microtiles of
// the m x n matrix C that fall in the jr/ir iteration ranges reported by
// bli_thread_range_jrir_sl() for 'thread' (2nd/jr loop) and its sub-node
// 'caucus' (1st/ir loop). Full microtiles are updated directly in C; edge
// microtiles are computed into the temporary buffer ct with a zero beta and
// then merged into C with xpbys_mxn.
//
// cs_a, rs_b and rntm are part of the signature but are not referenced in
// the body (cs_a and rs_b only appear in commented-out aliases).
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names: the panel dimensions of A and B
serve as the microtile dimensions MR and NR. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Its strides follow the storage
preference of the gemm micro-kernel (col_pref below): unit row stride
and column stride MR if the micro-kernel prefers columns, otherwise
row stride NR and unit column stride. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. A non-zero leftover adds one extra (edge) iteration. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node,
using the slab variant of the jr/ir range function. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. On the last
ir iteration the next panel of A wraps back to a_cast and B moves
on to its next panel, which in turn wraps back to b_cast on the
last jr iteration. These addresses are only recorded in the
auxinfo_t object below. */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel, updating C in place. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with a zero beta so that the
product is written to the temporary buffer ct. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C by beta and add the m_cur x n_cur result
from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -58,7 +59,9 @@ GENPROT( gemm_packa )
GENPROT( gemm_packb )
GENPROT( gemm_ker_var1 )
GENPROT( gemm_ker_var2 )
GENPROT( gemm_ker_var2sl )
GENPROT( gemm_ker_var2rr )
// Headers for induced algorithms:
GENPROT( gemm4mb_ker_var2 ) // 4m1b
@@ -90,7 +93,8 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )
// Headers for induced algorithms:
INSERT_GENTPROT_BASIC0( gemm4mb_ker_var2 ) // 4m1b

View File

@@ -300,11 +300,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
if ( bli_is_last_iter_rr( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter_rr( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\

View File

@@ -0,0 +1,555 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T (herk_fp) is the signature shared by the datatype-specific
// macrokernels that the GENTFUNC macro further down generates. Compared with
// the object-based entry point, every operand is passed as a raw buffer plus
// its strides / panel metadata, and the diagonal offset of C is passed
// explicitly as the first argument.
#define FUNCPTR_T herk_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of datatype-specific herk_l_ker_var2rr macrokernels, indexed by the
// execution datatype in bli_herk_l_ker_var2rr() (ftypes[dt_exec]).
// NOTE(review): GENARRAY is defined elsewhere; presumably it supplies the
// array declarator and an initializer with one instance per datatype --
// confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
//
// Object-based entry point for the round-robin herk macrokernel (lower case).
//
// Unpacks the diagonal offset, dimensions, buffers, strides and panel
// metadata of a, b and c, merges the scalars attached to a and b into a
// single alpha, takes beta from c, and forwards everything to the
// datatype-specific macrokernel selected from ftypes[] by the execution
// datatype of c. cntx, rntm and thread are passed through unchanged; cntl is
// not referenced in this function.
//
void bli_herk_l_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype and diagonal offset of C, pack schemas of A and B,
	// and the problem dimensions (C is m x n; k is the width of A).
	num_t  exec_dt = bli_obj_exec_dt( c );
	doff_t c_doff  = bli_obj_diag_offset( c );
	pack_t pack_a  = bli_obj_pack_schema( a );
	pack_t pack_b  = bli_obj_pack_schema( b );
	dim_t  m       = bli_obj_length( c );
	dim_t  n       = bli_obj_width( c );
	dim_t  k       = bli_obj_width( a );

	// Buffer, strides and panel metadata of A.
	void*  a_buf   = bli_obj_buffer_at_off( a );
	inc_t  a_cs    = bli_obj_col_stride( a );
	inc_t  a_is    = bli_obj_imag_stride( a );
	dim_t  a_pd    = bli_obj_panel_dim( a );
	inc_t  a_ps    = bli_obj_panel_stride( a );

	// Buffer, strides and panel metadata of B.
	void*  b_buf   = bli_obj_buffer_at_off( b );
	inc_t  b_rs    = bli_obj_row_stride( b );
	inc_t  b_is    = bli_obj_imag_stride( b );
	dim_t  b_pd    = bli_obj_panel_dim( b );
	inc_t  b_ps    = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void*  c_buf   = bli_obj_buffer_at_off( c );
	inc_t  c_rs    = bli_obj_row_stride( c );
	inc_t  c_cs    = bli_obj_col_stride( c );

	// Pull the scalars off A and B and multiply them together; the merged
	// value is read back from the second operand of bli_mulsc().
	obj_t  alpha_a;
	obj_t  alpha_ab;

	bli_obj_scalar_detach( a, &alpha_a );
	bli_obj_scalar_detach( b, &alpha_ab );
	bli_mulsc( &alpha_a, &alpha_ab );

	// alpha is the merged A/B scalar; beta is the scalar attached to C.
	void*  alpha_buf = bli_obj_internal_scalar_buffer( &alpha_ab );
	void*  beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Dispatch to the typed macrokernel for the execution datatype.
	ftypes[ exec_dt ]
	(
	  c_doff,
	  pack_a,
	  pack_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, a_cs, a_is,
	  a_pd, a_ps,
	  b_buf, b_rs, b_is,
	  b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
//
// Type-generic body of the round-robin herk macrokernel for the lower case;
// it is instantiated by INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr ) below.
//
// After pruning the parts of the m x n panel of C that need no work, the 2nd
// (jr) loop is run in two passes:
//   1. the rectangular region (the first n_iter_rct columns of microtiles),
//      in which every microtile is updated exactly as in gemm;
//   2. the triangular region (the remaining n_iter_tri columns), in which
//      each microtile is classified by its own diagonal offset diagoffc_ij:
//      tiles that intersect the diagonal are computed into the temporary
//      buffer ct and merged with xpbys_mxn_l, tiles strictly below the
//      diagonal are updated as in gemm, and all other tiles are skipped.
// Both passes take their jr/ir iteration ranges from
// bli_thread_range_jrir_rr().
//
// cs_a, rs_b and rntm are part of the signature but are not referenced in
// the body (cs_a and rs_b only appear in commented-out aliases).
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names: the panel dimensions of A and B
serve as the microtile dimensions MR and NR. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Its strides follow the storage
preference of the gemm micro-kernel (col_pref below): unit row stride
and column stride MR if the micro-kernel prefers columns, otherwise
row stride NR and unit column stride. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, ip; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely above the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region above where the diagonal of C intersects
the left edge of the panel, adjust the pointer to C and A and treat
this case as if the diagonal offset were zero. Whole micropanels
(ip of them, i.e. i = ip * MR rows) are skipped.
NOTE(review): '-diagoffc % MR' parses as '(-diagoffc) % MR', so the
adjusted offset is non-negative. It equals the true remaining offset
(diagoffc + i) only when -diagoffc is a multiple of MR, i.e. when the
result is zero, as the sentence above assumes. Confirm that callers
guarantee this alignment. */ \
if ( diagoffc < 0 ) \
{ \
ip = -diagoffc / MR; \
i = ip * MR; \
m = m - i; \
diagoffc = -diagoffc % MR; \
c_cast = c_cast + (i )*rs_c; \
a_cast = a_cast + (ip )*ps_a; \
} \
\
/* If there is a zero region to the right of where the diagonal
of C intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffc + m < n ) \
{ \
n = diagoffc + m; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. A non-zero leftover adds one extra (edge) iteration. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of C, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. Any remainder from this integer division is discarded, which
is what we want. That is, we want the rectangular region to contain
as many columns of whole microtiles as possible without including any
microtiles that intersect the diagonal. The number of iterations in
the triangular (or trapezoidal) region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_rct = diagoffc / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
loops for the initial rectangular region of C (if it exists). The ir
range computed here is reused unchanged by the triangular pass. */ \
bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. On the last
ir iteration the next panel of A wraps back to a_cast and B moves
on to its next panel, which in turn wraps back to b_cast on the
last jr iteration. These addresses are only recorded in the
auxinfo_t object below. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* No diagonal test is made in the rectangular region: every
microtile is handled like a plain gemm microtile. Full MR x NR
microtiles are updated in place, while edge microtiles go
through the temporary buffer. */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel, updating C in place. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with a zero beta so that the
product is written to the temporary buffer ct. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and
1st loops for the remaining triangular region of C. */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the triangular region
by the number of iterations used for the rectangular region. */ \
jr_start += n_iter_rct; \
jr_end += n_iter_rct; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B (same
wrap-around scheme as in the rectangular pass). */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it into the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel with a zero beta so that the
product is written to the temporary buffer ct. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel, updating C in place. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with a zero beta so that the
product is written to the temporary buffer ct. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr )

View File

@@ -57,10 +57,13 @@ typedef void (*FUNCPTR_T)
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
void bli_herk_l_ker_var2
void bli_herk_l_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -322,10 +325,8 @@ void PASTEMAC(ch,varname) \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
/* Use slab assignment of micropanels to threads in the 2nd and 1st
loops for the initial rectangular region of C (if it exists). */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
@@ -360,11 +361,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -425,8 +426,9 @@ void PASTEMAC(ch,varname) \
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd loop for the remaining triangular region of C. */ \
/* Use round-robin assignment of micropanels to threads in the 2nd
loop and slab partitioning in the 1st loop for the remaining
triangular region of C. */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the triangular region
@@ -464,11 +466,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -550,5 +552,5 @@ void PASTEMAC(ch,varname) \
} \
}
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2sl )

View File

@@ -0,0 +1,557 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T (herk_fp) is the signature shared by the datatype-specific
// macrokernels that the GENTFUNC macro further down generates. Compared with
// the object-based entry point, every operand is passed as a raw buffer plus
// its strides / panel metadata, and the diagonal offset of C is passed
// explicitly as the first argument.
#define FUNCPTR_T herk_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of datatype-specific herk_u_ker_var2rr macrokernels, indexed by the
// execution datatype in bli_herk_u_ker_var2rr() (ftypes[dt_exec]).
// NOTE(review): GENARRAY is defined elsewhere; presumably it supplies the
// array declarator and an initializer with one instance per datatype --
// confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
//
// Object-based entry point for the round-robin herk macrokernel (upper case).
//
// Unpacks the diagonal offset, dimensions, buffers, strides and panel
// metadata of a, b and c, merges the scalars attached to a and b into a
// single alpha, takes beta from c, and forwards everything to the
// datatype-specific macrokernel selected from ftypes[] by the execution
// datatype of c. cntx, rntm and thread are passed through unchanged; cntl is
// not referenced in this function.
//
void bli_herk_u_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype and diagonal offset of C, pack schemas of A and B,
	// and the problem dimensions (C is m x n; k is the width of A).
	num_t  exec_dt = bli_obj_exec_dt( c );
	doff_t c_doff  = bli_obj_diag_offset( c );
	pack_t pack_a  = bli_obj_pack_schema( a );
	pack_t pack_b  = bli_obj_pack_schema( b );
	dim_t  m       = bli_obj_length( c );
	dim_t  n       = bli_obj_width( c );
	dim_t  k       = bli_obj_width( a );

	// Buffer, strides and panel metadata of A.
	void*  a_buf   = bli_obj_buffer_at_off( a );
	inc_t  a_cs    = bli_obj_col_stride( a );
	inc_t  a_is    = bli_obj_imag_stride( a );
	dim_t  a_pd    = bli_obj_panel_dim( a );
	inc_t  a_ps    = bli_obj_panel_stride( a );

	// Buffer, strides and panel metadata of B.
	void*  b_buf   = bli_obj_buffer_at_off( b );
	inc_t  b_rs    = bli_obj_row_stride( b );
	inc_t  b_is    = bli_obj_imag_stride( b );
	dim_t  b_pd    = bli_obj_panel_dim( b );
	inc_t  b_ps    = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void*  c_buf   = bli_obj_buffer_at_off( c );
	inc_t  c_rs    = bli_obj_row_stride( c );
	inc_t  c_cs    = bli_obj_col_stride( c );

	// Pull the scalars off A and B and multiply them together; the merged
	// value is read back from the second operand of bli_mulsc().
	obj_t  alpha_a;
	obj_t  alpha_ab;

	bli_obj_scalar_detach( a, &alpha_a );
	bli_obj_scalar_detach( b, &alpha_ab );
	bli_mulsc( &alpha_a, &alpha_ab );

	// alpha is the merged A/B scalar; beta is the scalar attached to C.
	void*  alpha_buf = bli_obj_internal_scalar_buffer( &alpha_ab );
	void*  beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Dispatch to the typed macrokernel for the execution datatype.
	ftypes[ exec_dt ]
	(
	  c_doff,
	  pack_a,
	  pack_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, a_cs, a_is,
	  a_pd, a_ps,
	  b_buf, b_rs, b_is,
	  b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, jp; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely below the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, adjust the pointer to C and B
and treat this case as if the diagonal offset were zero.
NOTE: It's possible that after this pruning that the diagonal offset
is still positive (though it is guaranteed to be less than NR). */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
j = jp * NR; \
n = n - j; \
diagoffc = diagoffc % NR; \
c_cast = c_cast + (j )*cs_c; \
b_cast = b_cast + (jp )*ps_b; \
} \
\
/* If there is a zero region below where the diagonal of C intersects
the right edge of the panel, shrink it to prevent "no-op" iterations
from executing. */ \
if ( -diagoffc + n < m ) \
{ \
m = -diagoffc + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in C. A non-zero remainder means we need to
add one additional iteration. That is, we want the triangular region
to contain as few columns of whole microtiles as possible while still
including all microtiles that intersect the diagonal. The number of
iterations in the rectangular region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
loops for the initial triangular region of C (if it exists). */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or above the diagonal.
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
loops for the remaining rectangular region of C. */ \
bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Every submatrix in the rectangular region lies strictly above the
diagonal (all diagonal-intersecting microtiles were assigned to the
triangular region handled by the first set of loops), so we compute
and store as we normally would. */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2rr )

View File

@@ -57,10 +57,13 @@ typedef void (*FUNCPTR_T)
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
void bli_herk_u_ker_var2
void bli_herk_u_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -118,7 +121,7 @@ void bli_herk_u_ker_var2
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_exec];
f = ftypes[dt_exec];
// Invoke the function.
f( diagoffc,
@@ -324,10 +327,9 @@ void PASTEMAC(ch,varname) \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use interleaved (round robin) assignment of micropanels to threads in the
2nd loop for the initial triangular region of C (if it exists). For both
the rectangular and triangular regions, use contiguous assignment for the
1st loop. */ \
/* Use round-robin assignment of micropanels to threads in the 2nd loop
and slab partitioning in the 1st loop for the initial triangular region
of C (if it exists). */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
@@ -361,11 +363,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -449,8 +451,8 @@ void PASTEMAC(ch,varname) \
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the remaining triangular region of C. */ \
/* Use slab assignment of micropanels to threads in the 2nd and 1st loops
for the remaining rectangular region of C. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
@@ -489,11 +491,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -552,5 +554,5 @@ void PASTEMAC(ch,varname) \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2sl )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -55,9 +56,13 @@ void PASTEMAC0(opname) \
//GENPROT( herk_blk_var2 )
//GENPROT( herk_blk_var3 )
GENPROT( herk_x_ker_var2 )
GENPROT( herk_l_ker_var2 )
GENPROT( herk_u_ker_var2 )
GENPROT( herk_x_ker_var2sl )
GENPROT( herk_x_ker_var2rr )
GENPROT( herk_l_ker_var2sl )
GENPROT( herk_l_ker_var2rr )
GENPROT( herk_u_ker_var2sl )
GENPROT( herk_u_ker_var2rr )
//GENPROT( herk_packa )
//GENPROT( herk_packb )
@@ -89,6 +94,8 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( herk_l_ker_var2 )
INSERT_GENTPROT_BASIC0( herk_u_ker_var2 )
INSERT_GENTPROT_BASIC0( herk_l_ker_var2sl )
INSERT_GENTPROT_BASIC0( herk_l_ker_var2rr )
INSERT_GENTPROT_BASIC0( herk_u_ker_var2sl )
INSERT_GENTPROT_BASIC0( herk_u_ker_var2rr )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,12 +35,12 @@
#include "blis.h"
static gemm_var_oft vars[2] =
static gemm_var_oft vars_sl[2] =
{
bli_herk_l_ker_var2, bli_herk_u_ker_var2,
bli_herk_l_ker_var2sl, bli_herk_u_ker_var2sl,
};
void bli_herk_x_ker_var2
void bli_herk_x_ker_var2sl
(
obj_t* a,
obj_t* ah,
@@ -58,7 +59,48 @@ void bli_herk_x_ker_var2
else uplo = 1;
// Index into the variant array to extract the correct function pointer.
f = vars[uplo];
f = vars_sl[uplo];
// Call the macrokernel.
f
(
a,
ah,
c,
cntx,
rntm,
cntl,
thread
);
}
// -----------------------------------------------------------------------------
// Round-robin herk macrokernel variants, indexed by the uplo flag computed in
// bli_herk_x_ker_var2rr(): element 0 is selected when the root object of C is
// lower, element 1 otherwise.
static gemm_var_oft vars_rr[2] =
{
bli_herk_l_ker_var2rr, bli_herk_u_ker_var2rr,
};
void bli_herk_x_ker_var2rr
(
obj_t* a,
obj_t* ah,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bool_t uplo;
gemm_var_oft f;
// Set a bool based on the uplo field of C's root object.
if ( bli_obj_root_is_lower( c ) ) uplo = 0;
else uplo = 1;
// Index into the variant array to extract the correct function pointer.
f = vars_rr[uplo];
// Call the macrokernel.
f

View File

@@ -0,0 +1,535 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is aliased to gemm_fp: the signature of the datatype-specific
// trmm_ll macrokernels defined via GENTFUNC later in this file. The
// object-based front end passes, in order: the diagonal offset of A, the pack
// schemas of A and B, the dimensions m, n and k, the alpha scalar buffer, the
// buffer of A with its column stride, panel dimension and panel stride, the
// buffer of B with its row stride, panel dimension and panel stride, the beta
// scalar buffer, the buffer of C with its row and column strides, and finally
// the context, runtime and thread-info pointers.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers; the front end indexes it with the
// execution datatype of C.
// NOTE(review): GENARRAY presumably expands to an initializer naming one
// trmm_ll_ker_var2rr instantiation per datatype -- confirm against its
// definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
void bli_trmm_ll_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based front end for the round-robin trmm_ll macrokernel: it
	// unpacks the fields of A, B and C that the typed macrokernel needs,
	// merges the scalars attached to A and B into a single alpha, and then
	// dispatches on the execution datatype of C. The cntl argument is not
	// referenced here.

	// The execution datatype of C selects the typed macrokernel.
	const num_t  exec_dt   = bli_obj_exec_dt( c );

	// Fields queried from A.
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	void* const  a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	// Fields queried from B.
	const pack_t b_schema  = bli_obj_pack_schema( b );
	void* const  b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	// Fields queried from C.
	void* const  c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	// Problem dimensions: m and n are the length and width of C, and k is
	// the width of A.
	const dim_t  dim_m     = bli_obj_length( c );
	const dim_t  dim_n     = bli_obj_width( c );
	const dim_t  dim_k     = bli_obj_width( a );

	// Detach the scalars attached to A and B, then multiply them together.
	// The merged scalar is read back from the second operand of bli_mulsc().
	obj_t alpha_a;
	obj_t alpha_ab;

	bli_obj_scalar_detach( a, &alpha_a );
	bli_obj_scalar_detach( b, &alpha_ab );
	bli_mulsc( &alpha_a, &alpha_ab );

	// alpha is the merged scalar computed above; beta is the scalar attached
	// to C.
	void* const alpha_buf = bli_obj_internal_scalar_buffer( &alpha_ab );
	void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Look up the datatype-specific macrokernel and invoke it.
	ftypes[ exec_dt ]
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_buf,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Type-generic macrokernel body for trmm_ll with round-robin partitioning of
// the 2nd (jr) loop. For each NR-wide column panel of C assigned to the
// calling thread, it walks all MR-tall row panels of A: micro-panels that
// intersect the diagonal are multiplied using only their packed k_a1011
// columns and accumulate into C scaled by beta; micro-panels strictly below
// the diagonal use the full k and accumulate into C scaled by one; panels
// strictly above the diagonal are skipped. Edge microtiles are computed via
// the temporary buffer ct.
// NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates this macro once
// per basic datatype -- confirm against its definition.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. The check below aborts
when PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. The query of the thrinfo_t node for the
1st (ir) loop is commented out because ir loop parallelism is disabled
for now. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread id for the jr loop. (The
corresponding ir loop queries are commented out.) */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop,
which spans all n_iter column panels of C.
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). All m_iter iterations
are visited here since ir loop parallelism is disabled. */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. A panel that is strictly above the diagonal matches
neither branch below and is skipped. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1011 = 0; \
k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. The ir
loop test is given a work id of 0 and an n_way of 1 because
the ir loop is not parallelized. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
/* Advance A by the panel stride computed above for this
diagonal-intersecting micro-panel. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. The ir
loop test is given a work id of 0 and an n_way of 1 because
the ir loop is not parallelized. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
/* Advance A by the regular micro-panel stride (ps_a). */ \
a1 += rstep_a; \
} \
\
/* Advance C to the next MR rows on every iteration, whether or not a
branch above executed. */ \
c11 += rstep_c; \
} \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2rr )

View File

@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
void bli_trmm_ll_ker_var2
void bli_trmm_ll_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -333,10 +336,9 @@ void PASTEMAC(ch,varname) \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
/* Use slab assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists).
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
@@ -393,11 +395,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -457,17 +459,18 @@ void PASTEMAC(ch,varname) \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -524,9 +527,9 @@ void PASTEMAC(ch,varname) \
c11 += rstep_c; \
} \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2sl )

View File

@@ -0,0 +1,542 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is aliased to the name gemm_fp, so the typedef below actually
// declares a function-pointer type called gemm_fp.
#define FUNCPTR_T gemm_fp
// Signature of the datatype-specific macrokernels used by this file. The
// parameter list mirrors the one of the GENTFUNC-generated functions defined
// further down: the diagonal offset of A, the pack schemas of A and B, the
// m/n/k dimensions, alpha, the packed A and B buffers with their stride,
// panel dimension and panel stride, beta, the C buffer with its row/column
// strides, and the context, runtime and thread-info pointers.
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers; bli_trmm_lu_ker_var2rr() indexes it
// with the execution datatype of C.
// NOTE(review): GENARRAY is defined elsewhere; presumably it expands to an
// array initializer holding one trmm_lu_ker_var2rr instance per datatype --
// confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
// Object-based entry point for the round-robin trmm_lu macrokernel.
//
// Unpacks the fields of the (packed) objects a, b and c that the typed
// macrokernel needs, merges the scalars attached to a and b into a single
// alpha, takes beta from the scalar attached to c, and then dispatches to the
// typed macrokernel selected by the execution datatype of c.
//
// The cntl argument is accepted for interface compatibility but is not read
// here.
void bli_trmm_lu_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C selects which typed macrokernel runs.
	const num_t  exec_dt  = bli_obj_exec_dt( c );

	// Properties of A: diagonal offset, pack schema, buffer and strides.
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const pack_t a_schema  = bli_obj_pack_schema( a );

	// The schema of B is also forwarded to the macrokernel.
	const pack_t b_schema  = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is taken from the width of A.
	const dim_t  m_dim    = bli_obj_length( c );
	const dim_t  n_dim    = bli_obj_width( c );
	const dim_t  k_dim    = bli_obj_width( a );

	void* const  a_buf    = bli_obj_buffer_at_off( a );
	const inc_t  a_cs     = bli_obj_col_stride( a );
	const dim_t  a_pd     = bli_obj_panel_dim( a );
	const inc_t  a_ps     = bli_obj_panel_stride( a );

	void* const  b_buf    = bli_obj_buffer_at_off( b );
	const inc_t  b_rs     = bli_obj_row_stride( b );
	const dim_t  b_pd     = bli_obj_panel_dim( b );
	const inc_t  b_ps     = bli_obj_panel_stride( b );

	void* const  c_buf    = bli_obj_buffer_at_off( c );
	const inc_t  c_rs     = bli_obj_row_stride( c );
	const inc_t  c_cs     = bli_obj_col_stride( c );

	// Pull the scalars off A and B and fold them together; the product is
	// left in kappa_b.
	obj_t kappa_a;
	obj_t kappa_b;

	bli_obj_scalar_detach( a, &kappa_a );
	bli_obj_scalar_detach( b, &kappa_b );
	bli_mulsc( &kappa_a, &kappa_b );

	// alpha is the merged scalar; beta is whatever is attached to C.
	void* const alpha_ptr = bli_obj_internal_scalar_buffer( &kappa_b );
	void* const beta_ptr  = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macrokernel and hand everything over to it.
	ftypes[exec_dt]
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_ptr,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_ptr,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Template for the datatype-specific trmm_lu_ker_var2rr macrokernels.
//
// The generated function walks C in NR-wide column panels (the jr loop, whose
// range is obtained from bli_thread_range_jrir_rr) and, for each of them, in
// MR-tall row panels (the ir loop, which every thread executes in full since
// ir parallelism is disabled below). For each micro-panel of A it either
//   - intersects the diagonal: calls the gemm micro-kernel with the shortened
//     k dimension k_a1112 and the caller's beta, or
//   - lies strictly above the diagonal: calls the gemm micro-kernel with the
//     full (possibly reduced) k and accumulates into C.
// Edge tiles (m_cur < MR or n_cur < NR) are routed through the temporary
// buffer ct. The function returns early when any dimension is zero or when
// the block of A is strictly below the diagonal, and calls bli_abort() when
// the odd-blocksize safety trap fires.
// NOTE(review): INSERT_GENTFUNC_BASIC0 is defined elsewhere; presumably it
// instantiates this template once per supported datatype -- confirm.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t off_a1112; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. The test below aborts
when PACKMR and NR are both odd, or when PACKNR and MR are both
odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely below the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd (jr)
loop. The range requested here spans all n_iter column panels of C;
the 1st (ir) loop is not partitioned among threads in this kernel. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly above the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1112 = diagoffa_i; \
k_a1112 = k - off_a1112; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
/* Diagonal-intersecting micro-panels of A are advanced by their
own (shorter) panel stride rather than by rstep_a. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel, accumulating into C (beta is
one). */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel, writing into the temporary
buffer ct with a beta of zero. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2rr )

View File

@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
void bli_trmm_lu_ker_var2
void bli_trmm_lu_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -340,10 +343,8 @@ void PASTEMAC(ch,varname) \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
/* Use slab assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
@@ -393,17 +394,18 @@ void PASTEMAC(ch,varname) \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -463,17 +465,18 @@ void PASTEMAC(ch,varname) \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -531,9 +534,9 @@ void PASTEMAC(ch,varname) \
} \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )
INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2sl )

View File

@@ -0,0 +1,598 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is aliased to the name gemm_fp, so the typedef below actually
// declares a function-pointer type called gemm_fp.
#define FUNCPTR_T gemm_fp
// Signature of the datatype-specific macrokernels used by this file. The
// parameter list mirrors the one of the GENTFUNC-generated functions defined
// further down: the diagonal offset of B, the pack schemas of A and B, the
// m/n/k dimensions, alpha, the packed A and B buffers with their stride,
// panel dimension and panel stride, beta, the C buffer with its row/column
// strides, and the context, runtime and thread-info pointers.
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers; bli_trmm_rl_ker_var2rr() indexes it
// with the execution datatype of C.
// NOTE(review): GENARRAY is defined elsewhere; presumably it expands to an
// array initializer holding one trmm_rl_ker_var2rr instance per datatype --
// confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
// Object-based entry point for the round-robin trmm_rl macrokernel.
//
// Unpacks the fields of the (packed) objects a, b and c that the typed
// macrokernel needs, merges the scalars attached to a and b into a single
// alpha, takes beta from the scalar attached to c, and then dispatches to the
// typed macrokernel selected by the execution datatype of c.
//
// The cntl argument is accepted for interface compatibility but is not read
// here.
void bli_trmm_rl_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C selects which typed macrokernel runs.
	const num_t  exec_dt   = bli_obj_exec_dt( c );

	// In this variant the diagonal offset is read from B.
	const doff_t b_diagoff = bli_obj_diag_offset( b );

	const pack_t a_schema  = bli_obj_pack_schema( a );
	const pack_t b_schema  = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is taken from the width of A.
	const dim_t  m_dim     = bli_obj_length( c );
	const dim_t  n_dim     = bli_obj_width( c );
	const dim_t  k_dim     = bli_obj_width( a );

	void* const  a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	void* const  b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	void* const  c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	// Pull the scalars off A and B and fold them together; the product is
	// left in kappa_b.
	obj_t kappa_a;
	obj_t kappa_b;

	bli_obj_scalar_detach( a, &kappa_a );
	bli_obj_scalar_detach( b, &kappa_b );
	bli_mulsc( &kappa_a, &kappa_b );

	// alpha is the merged scalar; beta is whatever is attached to C.
	void* const alpha_ptr = bli_obj_internal_scalar_buffer( &kappa_b );
	void* const beta_ptr  = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macrokernel and hand everything over to it.
	ftypes[exec_dt]
	(
	  b_diagoff,
	  a_schema,
	  b_schema,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_ptr,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_ptr,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Template for the datatype-specific trmm_rl_ker_var2rr macrokernels.
//
// The generated function splits the n dimension (the jr loop) into two
// regions:
//   - a rectangular region of n_iter_rct column panels of B, where both the
//     jr and ir ranges come from bli_thread_range_jrir_rr and the gemm
//     micro-kernel accumulates into C using the full (possibly reduced) k;
//   - a triangular region of n_iter_tri column panels, where every thread
//     walks all jr/ir iterations but only computes those for which
//     bli_trmm_my_iter() is true, using the shortened k dimension k_b1121
//     and the caller's beta.
// Edge tiles (m_cur < MR or n_cur < NR) are routed through the temporary
// buffer ct. The function returns early when any dimension is zero or when
// the panel of B is strictly above the diagonal, and calls bli_abort() when
// the odd-blocksize safety trap fires.
// NOTE(review): INSERT_GENTFUNC_BASIC0 is defined elsewhere; presumably it
// instantiates this template once per supported datatype -- confirm.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b1121; \
dim_t off_b1121; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. The test below aborts
when PACKMR and NR are both odd, or when PACKNR and MR are both
odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
case as if the diagonal offset were zero. Note that we don't need to
adjust the pointer to B since packm would have simply skipped over
the region that was not stored. */ \
if ( diagoffb < 0 ) \
{ \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
} \
\
/* If there is a zero region to the right of where the diagonal
of B intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffb + k < n ) \
{ \
n = diagoffb + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The 'thread' argument is used for the 2nd (jr) loop; its sub-node
(caucus) is used for the 1st (ir) loop. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of B, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
/* NOTE(review): the earlier diagonal test on B passes ( diagoffb, k, n ),
whereas this one passes m as the row dimension. Confirm whether
bli_is_strictly_below_diag_n() ignores that argument or whether k was
intended here. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. (There should never be any remainder in this division.) The
number of iterations in the triangular (or trapezoidal) region is
computed as the remaining number of iterations in the n dimension. */ \
n_iter_rct = diagoffb / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
loops for the initial rectangular region of B (if it exists). */ \
bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel, accumulating into C (beta is
one). */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel, writing into the temporary
buffer ct with a beta of zero. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
for the remaining triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
/* Advance the starting b1 and c1 pointers to the positions corresponding
to the start of the triangular region of B. */ \
jr_start = n_iter_rct; \
b1 = b_cast + jr_start * cstep_b; \
c1 = c_cast + jr_start * cstep_c; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in A. Then compute the length of that panel. */ \
off_b1121 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b1121; \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b1121, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b1121, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/* These pointer increments run for every i, including iterations
this thread skipped, so a1 and c11 stay in step with i. */ \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
/* Likewise, b1 is advanced for every j, using the panel stride of the
current diagonal-intersecting micro-panel of B. */ \
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2rr )

View File

@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
void bli_trmm_rl_ker_var2
void bli_trmm_rl_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -360,10 +363,8 @@ void PASTEMAC(ch,varname) \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of B (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
/* Use slab assignment of micropanels to threads in the 2nd and 1st
loops for the initial rectangular region of B (if it exists). */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
@@ -399,11 +400,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -455,12 +456,12 @@ void PASTEMAC(ch,varname) \
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd loop for the remaining triangular region of B (if it exists).
/* Use round-robin assignment of micropanels to threads in the 2nd loop
for the remaining triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr loop but skip all but the pointer increment for iterations
that are not assigned to it. */ \
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
/* Advance the starting b1 and c1 pointers to the positions corresponding
to the start of the triangular region of B. */ \
@@ -522,11 +523,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -589,9 +590,9 @@ void PASTEMAC(ch,varname) \
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )
INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2sl )

View File

@@ -0,0 +1,618 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias; the typedef below therefore declares the
// function pointer type named gemm_fp.
#define FUNCPTR_T gemm_fp
// Signature of the type-specific trmm_ru round-robin macrokernels. The
// arguments mirror what the object-based wrapper in this file extracts from
// its obj_t operands: the diagonal offset of B, the pack schemas, the m/n/k
// dimensions, the two scalars, and for each packed matrix its buffer, one
// stride, its panel dimension (pd_*) and its panel stride (ps_*).
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers named ftypes; the wrapper indexes it
// with the execution datatype of C.
// NOTE(review): GENARRAY is defined elsewhere -- presumably it expands to an
// initializer holding one trmm_ru_ker_var2rr instance per datatype; confirm.
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
void bli_trmm_ru_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based entry point: unpack everything the typed macrokernel
	// needs from the operand objects, merge the scalars attached to A and B,
	// and dispatch on the execution datatype of C. (cntl is accepted as part
	// of the common macrokernel signature but is not read here.)

	// Datatype used to select the typed macrokernel.
	const num_t  exec_dt    = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular operand B and both pack schemas.
	const doff_t b_diagoff  = bli_obj_diag_offset( b );
	const pack_t a_schema   = bli_obj_pack_schema( a );
	const pack_t b_schema   = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t  m_dim      = bli_obj_length( c );
	const dim_t  n_dim      = bli_obj_width( c );
	const dim_t  k_dim      = bli_obj_width( a );

	// Packed A: buffer, column stride, panel dimension and panel stride.
	void* const  a_buf      = bli_obj_buffer_at_off( a );
	const inc_t  a_cs       = bli_obj_col_stride( a );
	const dim_t  a_pd       = bli_obj_panel_dim( a );
	const inc_t  a_ps       = bli_obj_panel_stride( a );

	// Packed B: buffer, row stride, panel dimension and panel stride.
	void* const  b_buf      = bli_obj_buffer_at_off( b );
	const inc_t  b_rs       = bli_obj_row_stride( b );
	const dim_t  b_pd       = bli_obj_panel_dim( b );
	const inc_t  b_ps       = bli_obj_panel_stride( b );

	// Output matrix C: buffer and both strides.
	void* const  c_buf      = bli_obj_buffer_at_off( c );
	const inc_t  c_rs       = bli_obj_row_stride( c );
	const inc_t  c_cs       = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and multiply them together;
	// the merged value is read back out of the second object.
	obj_t alpha_of_a;
	obj_t alpha_merged;

	bli_obj_scalar_detach( a, &alpha_of_a );
	bli_obj_scalar_detach( b, &alpha_merged );
	bli_mulsc( &alpha_of_a, &alpha_merged );

	// alpha is the merged scalar; beta is the scalar attached to C.
	void* const alpha_buf = bli_obj_internal_scalar_buffer( &alpha_merged );
	void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macrokernel and invoke it.
	FUNCPTR_T typed_ker = ftypes[exec_dt];

	typed_ker
	(
	  b_diagoff,
	  a_schema,
	  b_schema,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_buf,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Type-generic template for the round-robin trmm_ru macrokernel. For each
// datatype instantiated by INSERT_GENTFUNC_BASIC0 at the end of this block,
// it defines a function PASTEMAC(ch,varname) that walks C in tiles of NR
// columns (the jr loop) by MR rows (the ir loop) and calls the gemm
// micro-kernel on each tile, routing partial edge tiles through the
// temporary buffer ct.
// The jr loop is split into two parts: a leading triangular region of B, in
// which every thread visits every iteration and bli_trmm_my_iter() decides
// which ones it actually computes, followed by a rectangular region whose
// iteration ranges come from bli_thread_range_jrir_rr().
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j, jb0; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute the indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in B. (There should never be any remainder
in this division, but the computation below rounds up if there is
one.) The number of iterations in the rectangular region is computed
as the remaining number of iterations in the n dimension. */ \
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
for the initial triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter_tri; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
loops for the remaining rectangular region of B. */ \
bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
jb0 = n_iter_tri; \
\
/* Save the resulting value of b1 from the previous loop since it represents
the starting point for the rectangular region. */ \
b_cast = b1; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* NOTE: We must index through b_cast differently since it contains
the starting address of the rectangular region (which is already
n_iter_tri logical iterations through B). */ \
b1 = b_cast + (j-jb0) * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* In the rectangular region the micro-kernel result is accumulated
into C: interior tiles pass one as beta, and edge tiles are computed
into ct with a beta of zero and then added to C. */ \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
\
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2rr )

View File

@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
void bli_trmm_ru_ker_var2
void bli_trmm_ru_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -365,12 +368,12 @@ void PASTEMAC(ch,varname) \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd loop for the initial triangular region of B (if it exists).
/* Use round-robin assignment of micropanels to threads in the 2nd loop
for the initial triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr loop but skip all but the pointer increment for iterations
that are not assigned to it. */ \
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
b1 = b_cast; \
c1 = c_cast; \
@@ -428,11 +431,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -498,8 +501,8 @@ void PASTEMAC(ch,varname) \
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use contiguous assignment of micropanels to threads in both the 2nd and
1st loops the remaining triangular region of B. */ \
/* Use slab assignment of micropanels to threads in the 2nd and 1st
loops for the remaining rectangular region of B. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
@@ -552,11 +555,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -607,9 +610,9 @@ void PASTEMAC(ch,varname) \
\
\
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2sl )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -55,11 +56,17 @@ void PASTEMAC0(opname) \
//GENPROT( trmm_blk_var2 )
//GENPROT( trmm_blk_var3 )
GENPROT( trmm_xx_ker_var2 )
GENPROT( trmm_ll_ker_var2 )
GENPROT( trmm_lu_ker_var2 )
GENPROT( trmm_rl_ker_var2 )
GENPROT( trmm_ru_ker_var2 )
GENPROT( trmm_xx_ker_var2sl )
GENPROT( trmm_xx_ker_var2rr )
GENPROT( trmm_ll_ker_var2sl )
GENPROT( trmm_ll_ker_var2rr )
GENPROT( trmm_lu_ker_var2sl )
GENPROT( trmm_lu_ker_var2rr )
GENPROT( trmm_rl_ker_var2sl )
GENPROT( trmm_rl_ker_var2rr )
GENPROT( trmm_ru_ker_var2sl )
GENPROT( trmm_ru_ker_var2rr )
//
@@ -89,8 +96,12 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2rr )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2sl )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2rr )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,13 +35,13 @@
#include "blis.h"
static gemm_var_oft vars[2][2] =
static gemm_var_oft vars_sl[2][2] =
{
{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
{ bli_trmm_ll_ker_var2sl, bli_trmm_lu_ker_var2sl },
{ bli_trmm_rl_ker_var2sl, bli_trmm_ru_ker_var2sl }
};
void bli_trmm_xx_ker_var2
void bli_trmm_xx_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -72,7 +73,62 @@ void bli_trmm_xx_ker_var2
}
// Index into the variant array to extract the correct function pointer.
f = vars[side][uplo];
f = vars_sl[side][uplo];
// Call the macrokernel.
f
(
a,
b,
c,
cntx,
rntm,
cntl,
thread
);
}
// -----------------------------------------------------------------------------
// Round-robin macrokernel variants, indexed as vars_rr[side][uplo]. The
// dispatcher that follows sets side to 0 when A is the triangular operand
// and 1 when B is, and uplo to 0 for lower and 1 for upper storage.
static gemm_var_oft vars_rr[2][2] =
{
{ bli_trmm_ll_ker_var2rr, bli_trmm_lu_ker_var2rr },
{ bli_trmm_rl_ker_var2rr, bli_trmm_ru_ker_var2rr }
};
void bli_trmm_xx_ker_var2rr
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bool_t side;
bool_t uplo;
gemm_var_oft f;
// Set two bools: one based on the implied side parameter (the structure
// of the root object) and one based on the uplo field of the triangular
// matrix's root object (whether that is matrix A or matrix B).
if ( bli_obj_root_is_triangular( a ) )
{
side = 0;
if ( bli_obj_root_is_lower( a ) ) uplo = 0;
else uplo = 1;
}
else // if ( bli_obj_root_is_triangular( b ) )
{
side = 1;
if ( bli_obj_root_is_lower( b ) ) uplo = 0;
else uplo = 1;
}
// Index into the variant array to extract the correct function pointer.
f = vars_rr[side][uplo];
// Call the macrokernel.
f

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -53,7 +54,28 @@ cntl_t* bli_trsm_l_cntl_create
pack_t schema_b
)
{
void* macro_kernel_p = bli_trsm_xx_ker_var2;
void* macro_kernel_p;
void* packa_fp;
void* packb_fp;
#ifdef BLIS_ENABLE_JRIR_SLAB
// Use the function pointer to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_p = bli_trsm_xx_ker_var2sl;
packa_fp = bli_packm_blk_var1sl;
packb_fp = bli_packm_blk_var1sl;
#else // BLIS_ENABLE_JRIR_RR
// Use the function pointer to the macrokernels that use round-robin
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_p = bli_trsm_xx_ker_var2rr;
packa_fp = bli_packm_blk_var1rr;
packb_fp = bli_packm_blk_var1rr;
#endif
const opid_t family = BLIS_TRSM;
@@ -78,7 +100,7 @@ cntl_t* bli_trsm_l_cntl_create
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
(
bli_trsm_packa,
bli_packm_blk_var1,
packa_fp,
BLIS_MR,
BLIS_MR,
TRUE, // do NOT invert diagonal
@@ -102,7 +124,7 @@ cntl_t* bli_trsm_l_cntl_create
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
(
bli_trsm_packb,
bli_packm_blk_var1,
packb_fp,
BLIS_MR,
BLIS_NR,
FALSE, // do NOT invert diagonal
@@ -140,7 +162,16 @@ cntl_t* bli_trsm_r_cntl_create
pack_t schema_b
)
{
void* macro_kernel_p = bli_trsm_xx_ker_var2;
// trsm macrokernels are presently disabled for right-side execution,
// so it doesn't matter which function pointer we use here (sl or rr).
// To be safe, we'll insert an abort() guard to alert the developers
// of this should right-side macrokernels ever be re-enabled.
void* macro_kernel_p = bli_trsm_xx_ker_var2sl;
void* packa_fp = bli_packm_blk_var1sl;
void* packb_fp = bli_packm_blk_var1sl;
bli_abort();
const opid_t family = BLIS_TRSM;
@@ -165,7 +196,7 @@ cntl_t* bli_trsm_r_cntl_create
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
(
bli_trsm_packa,
bli_packm_blk_var1,
packa_fp,
BLIS_NR,
BLIS_MR,
FALSE, // do NOT invert diagonal
@@ -189,7 +220,7 @@ cntl_t* bli_trsm_r_cntl_create
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
(
bli_trsm_packb,
bli_packm_blk_var1,
packb_fp,
BLIS_MR,
BLIS_MR,
TRUE, // do NOT invert diagonal

View File

@@ -0,0 +1,605 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias; the typedef below therefore declares the
// function pointer type named gemm_fp.
#define FUNCPTR_T gemm_fp
// Signature of the type-specific trsm_ll round-robin macrokernels. The
// arguments mirror what the object-based wrapper in this file extracts from
// its obj_t operands: the diagonal offset of A, the pack schemas, the m/n/k
// dimensions, and for each packed matrix its buffer, one stride, its panel
// dimension (pd_*) and its panel stride (ps_*). alpha1 is the scalar attached
// to B and alpha2 is the scalar attached to C.
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers named ftypes; the wrapper indexes it
// with the execution datatype of C.
// NOTE(review): GENARRAY is defined elsewhere -- presumably it expands to an
// initializer holding one trsm_ll_ker_var2rr instance per datatype; confirm.
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
void bli_trsm_ll_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based entry point: unpack everything the typed macrokernel
	// needs from the operand objects and dispatch on the execution datatype
	// of C. (cntl is accepted as part of the common macrokernel signature
	// but is not read here.)

	// Datatype used to select the typed macrokernel.
	const num_t  exec_dt    = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular operand A and both pack schemas.
	const doff_t a_diagoff  = bli_obj_diag_offset( a );
	const pack_t a_schema   = bli_obj_pack_schema( a );
	const pack_t b_schema   = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t  m_dim      = bli_obj_length( c );
	const dim_t  n_dim      = bli_obj_width( c );
	const dim_t  k_dim      = bli_obj_width( a );

	// Packed A: buffer, column stride, panel dimension and panel stride.
	void* const  a_buf      = bli_obj_buffer_at_off( a );
	const inc_t  a_cs       = bli_obj_col_stride( a );
	const dim_t  a_pd       = bli_obj_panel_dim( a );
	const inc_t  a_ps       = bli_obj_panel_stride( a );

	// Packed B: buffer, row stride, panel dimension and panel stride.
	void* const  b_buf      = bli_obj_buffer_at_off( b );
	const inc_t  b_rs       = bli_obj_row_stride( b );
	const dim_t  b_pd       = bli_obj_panel_dim( b );
	const inc_t  b_ps       = bli_obj_panel_stride( b );

	// Output matrix C: buffer and both strides.
	void* const  c_buf      = bli_obj_buffer_at_off( c );
	const inc_t  c_rs       = bli_obj_row_stride( c );
	const inc_t  c_cs       = bli_obj_col_stride( c );

	// Scalar attached to B (the non-triangular matrix). It serves as the
	// alpha of the gemmtrsm subproblems, i.e. the scalar that would be
	// applied to the packed copy of B before the trsm subproblem updates
	// it. It may be unit, for example if it was applied during packing.
	void* const  alpha1_buf = bli_obj_internal_scalar_buffer( b );

	// Scalar attached to C. It serves as the "beta" of the gemm-only
	// subproblems for micro-panels that do not intersect the diagonal. It
	// is kept separate because the alpha attached to B may have been reset
	// if it was applied during packing.
	void* const  alpha2_buf = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macrokernel and invoke it.
	FUNCPTR_T typed_ker = ftypes[exec_dt];

	typed_ker
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha1_buf,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  alpha2_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Type-generic body of the lower-triangular, left-side trsm macrokernel that
// uses round-robin assignment of micropanels to threads.
// INSERT_GENTFUNC_BASIC0 at the bottom instantiates it as
// PASTEMAC(ch,varname).
//
// Structure of the computation, as implemented below:
// - The outer (jr) loop walks NR-wide micropanels of B/C. Its start, end and
//   stride come from bli_thread_range_jrir_rr().
// - The inner (ir) loop walks MR-tall micropanels of A from top to bottom and
//   is not parallelized.
// - A micropanel of A that intersects the diagonal is handled by the fused
//   gemmtrsm micro-kernel (alpha1 is forwarded to it). A micropanel strictly
//   below the diagonal is handled by the gemm micro-kernel with alpha = -1
//   and beta = alpha2.
// - Edge tiles (m_cur != MR or n_cur != NR) are computed into the stack
//   buffer ct and then copied (gemmtrsm) or accumulated (gemm) into C.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage preference of
the gemm micro-kernel (col_pref): column-stored (1, MR) if it prefers
columns, row-stored (NR, 1) otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
dim_t off_a11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if PACKMR and NR are both odd, or if PACKNR and MR
are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. A nonzero remainder adds one extra (edge) iteration. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Round the imaginary strides up to the next even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Restart at the first micro-panel of A and the top row-block of the
current column panel of C. */ \
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a10 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a10 = k_a1011 - MR; \
off_a11 = k_a10; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. On the
last ir iteration A wraps back to its first panel; on this
thread's last jr iteration B wraps back to its first panel
as well. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel, directing its
output tile to the temporary buffer ct. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the m_cur x n_cur result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with beta = zero so that the
product lands in ct; alpha2 is applied in the accumulation
step below. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2rr )

View File

@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
void bli_trsm_ll_ker_var2
void bli_trsm_ll_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -352,7 +355,9 @@ void PASTEMAC(ch,varname) \
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
/* Use slab assignment of micropanels to threads in the 2nd loop.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
@@ -419,11 +424,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -483,11 +488,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -596,5 +601,5 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1,
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2sl )

View File

@@ -0,0 +1,586 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T (aliased to the name gemm_fp) is the pointer type of the
// datatype-specific macrokernels generated further down in this file. The
// parameter list mirrors, one for one, the arguments that the object-based
// wrapper extracts from its obj_t operands before dispatching.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers. The wrapper indexes it with the
// execution datatype of C (ftypes[dt_exec]).
// NOTE(review): GENARRAY is defined elsewhere; presumably it fills the table
// with one entry per supported datatype -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
void bli_trsm_lu_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based entry point: unpack everything the typed macrokernel
	// needs from the operand objects, then dispatch on the execution
	// datatype of C. (cntl is accepted for interface compatibility but is
	// not consulted here.)
	const num_t  dt       = bli_obj_exec_dt( c );

	// Properties of the triangular matrix A and the packing schemas.
	const doff_t doff_a   = bli_obj_diag_offset( a );
	const pack_t pack_a   = bli_obj_pack_schema( a );
	const pack_t pack_b   = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, A is m x k.
	const dim_t  m_dim    = bli_obj_length( c );
	const dim_t  n_dim    = bli_obj_width( c );
	const dim_t  k_dim    = bli_obj_width( a );

	// Packed A: buffer, column stride, panel dimension, panel stride.
	void*        a_ptr    = bli_obj_buffer_at_off( a );
	const inc_t  a_cs     = bli_obj_col_stride( a );
	const dim_t  a_pd     = bli_obj_panel_dim( a );
	const inc_t  a_ps     = bli_obj_panel_stride( a );

	// Packed B: buffer, row stride, panel dimension, panel stride.
	void*        b_ptr    = bli_obj_buffer_at_off( b );
	const inc_t  b_rs     = bli_obj_row_stride( b );
	const dim_t  b_pd     = bli_obj_panel_dim( b );
	const inc_t  b_ps     = bli_obj_panel_stride( b );

	// Output matrix C: buffer and both strides.
	void*        c_ptr    = bli_obj_buffer_at_off( c );
	const inc_t  c_rs     = bli_obj_row_stride( c );
	const inc_t  c_cs     = bli_obj_col_stride( c );

	// The scalar attached to B (the non-triangular matrix) is the alpha
	// handed to the gemmtrsm subproblems. It may be unit if, for example,
	// it was already applied during packing.
	void*        alpha_b  = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C acts as "beta" in the gemm-only subproblems
	// (micro-panels that do not intersect the diagonal). It is kept
	// separately because the alpha attached to B may have been reset when
	// it was applied during packing.
	void*        alpha_c  = bli_obj_internal_scalar_buffer( c );

	// Select the datatype-specific macrokernel and run it.
	FUNCPTR_T    ker      = ftypes[dt];

	ker
	(
	  doff_a,
	  pack_a,
	  pack_b,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_b,
	  a_ptr, a_cs, a_pd, a_ps,
	  b_ptr, b_rs, b_pd, b_ps,
	  alpha_c,
	  c_ptr, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Type-generic body of the upper-triangular, left-side trsm macrokernel that
// uses round-robin assignment of micropanels to threads.
// INSERT_GENTFUNC_BASIC0 at the bottom instantiates it as
// PASTEMAC(ch,varname).
//
// Structure of the computation, as implemented below:
// - The outer (jr) loop walks NR-wide micropanels of B/C. Its start, end and
//   stride come from bli_thread_range_jrir_rr().
// - The inner (ir) loop is not parallelized and visits the row-blocks of C
//   from bottom to top (i = m_iter - 1 - ib), while the pointer into packed A
//   (a1) only ever advances forward.
// - A micropanel of A that intersects the diagonal is handled by the fused
//   gemmtrsm micro-kernel (alpha1 is forwarded to it). A micropanel strictly
//   above the diagonal is handled by the gemm micro-kernel with alpha = -1
//   and beta = alpha2.
// - Edge tiles (m_cur != MR or n_cur != NR) are computed into the stack
//   buffer ct and then copied (gemmtrsm) or accumulated (gemm) into C.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage preference of
the gemm micro-kernel (col_pref): column-stored (1, MR) if it prefers
columns, row-stored (NR, 1) otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t k_a11; \
dim_t k_a12; \
dim_t off_a11; \
dim_t off_a12; \
dim_t i, j, ib; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if PACKMR and NR are both odd, or if PACKNR and MR
are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. A nonzero remainder adds one extra (edge) iteration. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Round the imaginary strides up to the next even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Restart at the first micro-panel of A and at the bottom row-block of
the current column panel of C; the ir loop below moves upward. */ \
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( ib = 0; ib < m_iter; ++ib ) \
{ \
/* ib counts iterations; i is the corresponding row-block index,
running from m_iter-1 down to 0. */ \
i = m_iter - 1 - ib; \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides above the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a11 = diagoffa_i; \
k_a1112 = k - off_a11;; \
k_a11 = MR; \
k_a12 = k_a1112 - MR; \
off_a12 = off_a11 + k_a11; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the triangular block A11 and the
panel A12. */ \
a11 = a1; \
/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
\
/* Compute the addresses of the block B11 and the panel
B21. */ \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. On the
last ir iteration A wraps back to its first panel; on this
thread's last jr iteration B wraps back to its first panel
as well. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel, directing its
output tile to the temporary buffer ct. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the m_cur x n_cur result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with beta = zero so that the
product lands in ct; alpha2 is applied in the accumulation
step below. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 -= rstep_c; \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter = %lu\n", m_iter ); \
printf( "m_cur = %lu\n", m_cur ); \
printf( "k = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112 = %lu\n", off_a1112 ); \
printf( "k_a1112 = %lu\n", k_a1112 ); \
printf( "k_a12 = %lu\n", k_a12 ); \
printf( "k_a11 = %lu\n", k_a11 ); \
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2rr )

View File

@@ -55,10 +55,13 @@ typedef void (*FUNCPTR_T)
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
void bli_trsm_lu_ker_var2
void bli_trsm_lu_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -360,7 +363,9 @@ void PASTEMAC(ch,varname) \
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
/* Use slab assignment of micropanels to threads in the 2nd loop.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
@@ -429,11 +434,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -493,11 +498,11 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -577,5 +582,5 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur,
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2sl )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -441,12 +442,12 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
@@ -521,12 +522,12 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -434,12 +435,12 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
@@ -514,12 +515,12 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -57,9 +58,14 @@ GENPROT( trsm_blk_var3 )
GENPROT( trsm_packa )
GENPROT( trsm_packb )
GENPROT( trsm_xx_ker_var2 )
GENPROT( trsm_ll_ker_var2 )
GENPROT( trsm_lu_ker_var2 )
GENPROT( trsm_xx_ker_var2sl )
GENPROT( trsm_xx_ker_var2rr )
GENPROT( trsm_ll_ker_var2sl )
GENPROT( trsm_ll_ker_var2rr )
GENPROT( trsm_lu_ker_var2sl )
GENPROT( trsm_lu_ker_var2rr )
GENPROT( trsm_rl_ker_var2 )
GENPROT( trsm_ru_ker_var2 )
@@ -91,8 +97,11 @@ void PASTEMAC(ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2sl )
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2rr )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2sl )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2rr )
INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,13 +35,13 @@
#include "blis.h"
static trsm_var_oft vars[2][2] =
static trsm_var_oft vars_sl[2][2] =
{
{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
{ bli_trsm_ll_ker_var2sl, bli_trsm_lu_ker_var2sl },
{ bli_trsm_rl_ker_var2 , bli_trsm_ru_ker_var2 }
};
void bli_trsm_xx_ker_var2
void bli_trsm_xx_ker_var2sl
(
obj_t* a,
obj_t* b,
@@ -72,7 +73,62 @@ void bli_trsm_xx_ker_var2
}
// Index into the variant array to extract the correct function pointer.
f = vars[side][uplo];
f = vars_sl[side][uplo];
// Call the macrokernel.
f
(
a,
b,
c,
cntx,
rntm,
cntl,
thread
);
}
// -----------------------------------------------------------------------------
// Dispatch table for the round-robin trsm macrokernels, indexed as
// vars_rr[side][uplo]. The dispatcher below uses side 0 when the triangular
// root matrix is A and side 1 when it is B, and uplo 0 for lower and 1 for
// upper.
// NOTE: The side-1 row points at the unsuffixed rl/ru kernels; no "rr"
// variants of those are referenced here.
static trsm_var_oft vars_rr[2][2] =
{
{ bli_trsm_ll_ker_var2rr, bli_trsm_lu_ker_var2rr },
{ bli_trsm_rl_ker_var2 , bli_trsm_ru_ker_var2 }
};
void bli_trsm_xx_ker_var2rr
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bool_t side;
bool_t uplo;
trsm_var_oft f;
// Set two bools: one based on the implied side parameter (the structure
// of the root object) and one based on the uplo field of the triangular
// matrix's root object (whether that is matrix A or matrix B).
if ( bli_obj_root_is_triangular( a ) )
{
side = 0;
if ( bli_obj_root_is_lower( a ) ) uplo = 0;
else uplo = 1;
}
else // if ( bli_obj_root_is_triangular( b ) )
{
side = 1;
if ( bli_obj_root_is_lower( b ) ) uplo = 0;
else uplo = 1;
}
// Index into the variant array to extract the correct function pointer.
f = vars_rr[side][uplo];
// Call the macrokernel.
f

View File

@@ -792,14 +792,16 @@ static bool_t bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
( i != 0 || n_left == 0 );
}
static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
static bool_t bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
return ( bool_t )
#ifdef BLIS_JRIR_INTERLEAVE
( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
#else
( i == end_iter - 1 );
#endif
}
// Round-robin variant of the "last iteration" test. Returns a true value
// when i is the final index that thread tid (out of nth threads) visits
// among the iterations below end_iter, i.e. end_iter - 1 moved back by
// ( end_iter - tid - 1 ) % nth.
// NOTE(review): assumes tid < end_iter and nth > 0 -- confirm with callers.
static bool_t bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
	// How far this thread's final index sits behind the overall last index.
	const dim_t n_back     = ( end_iter - tid - 1 ) % nth;
	const dim_t last_index = end_iter - 1 - n_back;

	return ( bool_t )( i == last_index );
}

View File

@@ -232,8 +232,6 @@ void bli_thread_init_rntm_from_env( rntm_t* rntm );
// -----------------------------------------------------------------------------
//printf( "bli_thread_range_jrir: inlv: th%d: start end inc: %d %d %d\n", (int)bli_thread_work_id( thread ), (int)*start, (int)*end, (int)*inc );
static void bli_thread_range_jrir_rr
(
thrinfo_t* thread,
@@ -242,7 +240,7 @@ static void bli_thread_range_jrir_rr
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
dim_t* inc
)
{
// Use interleaved partitioning of jr/ir loops.
@@ -259,7 +257,7 @@ static void bli_thread_range_jrir_sl
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
dim_t* inc
)
{
// Use contiguous slab partitioning of jr/ir loops.
@@ -267,6 +265,7 @@ static void bli_thread_range_jrir_sl
*inc = 1;
}
#if 0
static void bli_thread_range_jrir
(
thrinfo_t* thread,
@@ -275,14 +274,13 @@ static void bli_thread_range_jrir
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
dim_t* inc
)
{
//#ifdef BLIS_JRIR_INTERLEAVE
#if 0
bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
#else
#ifdef BLIS_ENABLE_JRIR_SLAB
bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
#else
bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
#endif
}
@@ -297,45 +295,30 @@ static void bli_thread_range_weighted_jrir
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
dim_t* inc
)
{
#ifdef BLIS_JRIR_INTERLEAVE
// Use interleaved partitioning of jr/ir loops.
*start = bli_thread_work_id( thread );
*inc = bli_thread_n_way( thread );
*end = n;
#else
#ifdef BLIS_ENABLE_JRIR_SLAB
// Use contiguous slab partitioning for jr/ir loops.
bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
handle_edge_low, start, end );
handle_edge_low, start, end );
*start = *start / bf; *inc = 1;
if ( *end % bf ) *end = *end / bf + 1;
else *end = *end / bf;
#endif
#else
#if 0
const dim_t n_way = bli_thread_n_way( thread );
// Use interleaved partitioning of jr/ir loops.
*start = bli_thread_work_id( thread );
*inc = bli_thread_n_way( thread );
*end = n;
if ( m * n / n_way > 25000 )
{
// Use contiguous slab partitioning for jr/ir loops.
bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
handle_edge_low, start, end );
*inc = 1;
}
else
{
// Use interleaved partitioning of jr/ir loops.
*start = bli_thread_work_id( thread );
*inc = n_way; //bli_thread_n_way( thread );
*end = n;
}
#endif
}
#endif
#endif

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -54,7 +55,28 @@ cntl_t* blx_gemmbp_cntl_create
pack_t schema_b
)
{
void* macro_kernel_p = blx_gemm_ker_var2;
void* macro_kernel_fp;
void* packa_fp;
void* packb_fp;
#ifdef BLIS_ENABLE_JRIR_SLAB
// Use the function pointers to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_fp = blx_gemm_ker_var2sl;
packa_fp = bli_packm_blk_var1sl;
packb_fp = bli_packm_blk_var1sl;
#else // BLIS_ENABLE_JRIR_RR
// Use the function pointers to the macrokernels that use round-robin
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_fp = bli_gemm_ker_var2rr;
packa_fp = bli_packm_blk_var1rr;
packb_fp = bli_packm_blk_var1rr;
#endif
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node
@@ -69,7 +91,7 @@ cntl_t* blx_gemmbp_cntl_create
(
family,
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
macro_kernel_fp,
gemm_cntl_bu_ke
);
@@ -77,7 +99,7 @@ cntl_t* blx_gemmbp_cntl_create
cntl_t* gemm_cntl_packa = blx_packm_cntl_create_node
(
blx_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
packa_fp,
BLIS_MR,
BLIS_KR,
FALSE, // do NOT invert diagonal
@@ -101,7 +123,7 @@ cntl_t* blx_gemmbp_cntl_create
cntl_t* gemm_cntl_packb = blx_packm_cntl_create_node
(
blx_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
packb_fp,
BLIS_KR,
BLIS_NR,
FALSE, // do NOT invert diagonal

View File

@@ -59,14 +59,14 @@ typedef void (*gemm_fp)
// Function pointer array for datatype-specific functions.
static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
{
PASTECH2(blx_,s,gemm_ker_var2),
PASTECH2(blx_,c,gemm_ker_var2),
PASTECH2(blx_,d,gemm_ker_var2),
PASTECH2(blx_,z,gemm_ker_var2)
PASTECH2(blx_,s,gemm_ker_var2rr),
PASTECH2(blx_,c,gemm_ker_var2rr),
PASTECH2(blx_,d,gemm_ker_var2rr),
PASTECH2(blx_,z,gemm_ker_var2rr)
};
void blx_gemm_ker_var2
void blx_gemm_ker_var2rr
(
obj_t* a,
obj_t* b,
@@ -272,8 +272,8 @@ void PASTECH2(blx_,ch,varname) \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
@@ -302,11 +302,11 @@ void PASTECH2(blx_,ch,varname) \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -363,11 +363,11 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c,
}
#if 0
GENTFUNC( float, s, gemm_ker_var2 )
GENTFUNC( double, d, gemm_ker_var2 )
GENTFUNC( scomplex, c, gemm_ker_var2 )
GENTFUNC( dcomplex, z, gemm_ker_var2 )
GENTFUNC( float, s, gemm_ker_var2rr )
GENTFUNC( double, d, gemm_ker_var2rr )
GENTFUNC( scomplex, c, gemm_ker_var2rr )
GENTFUNC( dcomplex, z, gemm_ker_var2rr )
#else
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr )
#endif

View File

@@ -0,0 +1,373 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "blix.h"
// Function pointer type for the datatype-specific (typed) macrokernels.
// The object-based wrapper in this file fills the arguments as follows:
// - schema_a, schema_b: pack schemas of A and B.
// - m, n: length and width of C; k: width of A.
// - alpha: product of the scalars attached to A and B; beta: the scalar
//   attached to C. Both are passed as untyped buffers.
// - a, b, c: buffers of the operands, followed by their strides. For A and
//   B these are a column/row stride, an imaginary stride (is_*), a panel
//   dimension (pd_*) and a panel stride (ps_*); for C, row and column
//   strides.
// - cntx, rntm, thread: context, runtime and thread-info objects, passed
//   through unchanged.
typedef void (*gemm_fp)
(
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Function pointer array for datatype-specific functions. The wrapper
// indexes this table with the execution datatype of C (a num_t value).
// NOTE(review): the s, c, d, z ordering presumably mirrors the numeric
// values of the num_t enumeration -- confirm before reordering entries.
static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
{
PASTECH2(blx_,s,gemm_ker_var2sl),
PASTECH2(blx_,c,gemm_ker_var2sl),
PASTECH2(blx_,d,gemm_ker_var2sl),
PASTECH2(blx_,z,gemm_ker_var2sl)
};
// Object-based entry point of the slab gemm macrokernel. It extracts from
// the A, B and C objects the dimensions, buffers, strides and pack schemas
// that the typed kernel expects, merges the scalars attached to A and B
// into a single alpha, takes beta from the scalar attached to C, and then
// calls the typed kernel selected by the execution datatype of C.
// The cntl argument is accepted but not consulted here.
void blx_gemm_ker_var2sl
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C picks the typed implementation.
	const num_t  dt       = bli_obj_exec_dt( c );

	// Pack schemas of the two input operands.
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// C is m x n; k is read from the width of A.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer, strides and panel geometry of A.
	void* const  a_buf    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const inc_t  is_a     = bli_obj_imag_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Buffer, strides and panel geometry of B.
	void* const  b_buf    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const inc_t  is_b     = bli_obj_imag_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void* const  c_buf    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// Pull the scalars off A and B and combine them; the merged value
	// ends up in kappa_b and serves as alpha.
	obj_t kappa_a;
	obj_t kappa_b;

	bli_obj_scalar_detach( a, &kappa_a );
	bli_obj_scalar_detach( b, &kappa_b );
	bli_mulsc( &kappa_a, &kappa_b );

	// Addresses of the internal scalar storage for alpha (merged above)
	// and beta (the scalar attached to C).
	void* const  alpha_buf = bli_obj_internal_scalar_buffer( &kappa_b );
	void* const  beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Look up and invoke the typed macrokernel in a single step.
	ftypes[dt]
	(
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, cs_a, is_a,
	  pd_a, ps_a,
	  b_buf, rs_b, is_b,
	  pd_b, ps_b,
	  beta_buf,
	  c_buf, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(blx_,ch,varname) \
( \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
// Instantiate the typed slab macrokernels from the GENTFUNC template above.
// The "#if 0" branch spells out the four instantiations (s, d, c, z) by hand
// and is compiled out; the active branch uses INSERT_GENTFUNC_BASIC0.
// NOTE(review): INSERT_GENTFUNC_BASIC0 presumably expands GENTFUNC for the
// same four datatypes -- confirm against its definition.
#if 0
GENTFUNC( float, s, gemm_ker_var2sl )
GENTFUNC( double, d, gemm_ker_var2sl )
GENTFUNC( scomplex, c, gemm_ker_var2sl )
GENTFUNC( dcomplex, z, gemm_ker_var2sl )
#else
INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl )
#endif

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -57,7 +58,8 @@ GENPROT( gemm_blk_var3 )
GENPROT( gemm_packa )
GENPROT( gemm_packb )
GENPROT( gemm_ker_var2 )
GENPROT( gemm_ker_var2sl )
GENPROT( gemm_ker_var2rr )
//
// Prototype BLAS-like interfaces with void pointer operands.
@@ -85,5 +87,6 @@ void PASTECH2(blx_,ch,varname) \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )