mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Support multithreading within the sup framework.
Details: - Added multithreading support to the sup framework (via either OpenMP or pthreads). Both variants 1n and 2m now have the appropriate threading infrastructure, including data partitioning logic, to parallelize computation. This support handles all four combinations of packing on matrices A and B (neither, A only, B only, or both). This implementation tries to be a little smarter when automatic threading is requested (e.g. via BLIS_NUM_THREADS) in that it will recalculate the factorization in units of micropanels (rather than using the raw dimensions) in bli_l3_sup_int.c, when the final problem shape is known and after threads have already been spawned. - Implemented bli_?packm_sup_var2(), which packs to conventional row- or column-stored matrices. (This is used for the rrc and crc storage cases.) Previously, copym was used, but that would no longer suffice because it could not be parallelized. - Minor reorganization of packing-related sup functions. Specifically, bli_packm_sup_init_mem_[ab]() are called from within packm_sup_[ab]() instead of from the variant functions. This has the effect of making the variant functions more readable. - Added additional bli_thrinfo_set_*() static functions to bli_thrinfo.h and inserted usage of these functions within bli_thrinfo_init(), which previously was accessing thrinfo_t fields via the -> operator. - Renamed bli_partition_2x2() to bli_thread_partition_2x2(). - Added an auto_factor field to the rntm_t struct in order to track whether automatic thread factorization was originally requested. - Added new test drivers in test/supmt that perform multithreaded sup tests, as well as appropriate octave/matlab scripts to plot the resulting output files. - Added additional language to docs/Multithreading.md to make it clear that specifying any BLIS_*_NT variable, even if it is set to 1, will be considered manual specification for the purposes of determining whether to auto-factorize via BLIS_NUM_THREADS. - Minor comment updates. AMD-Internal: [CPUPL-713] Change-Id: I9536648e7befac4d2dc17805e44ef34470961662
This commit is contained in:
committed by
dzambare
parent
a03a0f8f70
commit
2839b88f00
@@ -43,7 +43,6 @@ err_t bli_gemmsup_int
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
@@ -89,45 +88,86 @@ err_t bli_gemmsup_int
|
||||
stor_id == BLIS_CRR );
|
||||
const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
|
||||
const bool_t is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
|
||||
: is_rcc_crc_ccr_ccc );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
|
||||
const bool_t auto_factor = bli_rntm_auto_factor( rntm );
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
bool_t use_bp = TRUE;
|
||||
dim_t jc_new;
|
||||
dim_t ic_new;
|
||||
|
||||
|
||||
if ( is_primary )
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for row-preferential kernels
|
||||
// - rcc crc ccr ccc for column-preferential kernels
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t mu = m / MR;
|
||||
const dim_t nu = n / NR;
|
||||
|
||||
if ( mu >= nu )
|
||||
//if ( m % 2 == 1 && n % 2 == 1 )
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = FALSE;
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
if ( use_bp )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
}
|
||||
else // if ( !use_bp )
|
||||
{
|
||||
// In the panel-block algorithm, the m dimension is parallelized
|
||||
// with jc_nt and the n dimension is parallelized with ic_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
|
||||
}
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
|
||||
if ( use_bp )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
|
||||
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, cntl, thread );
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
else // if ( mu < nu )
|
||||
else // use_pb
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var1n primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
|
||||
bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, cntl, thread );
|
||||
stor_id, cntx, rntm, thread );
|
||||
// *requires nudging of nc up to be a multiple of mr.
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -136,35 +176,64 @@ err_t bli_gemmsup_int
|
||||
// - rrr rrc rcr crr for column-preferential kernels
|
||||
// - rcc crc ccr ccc for row-preferential kernels
|
||||
|
||||
const dim_t mt = bli_obj_width( c );
|
||||
const dim_t nt = bli_obj_length( c );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t mu = mt / MR;
|
||||
const dim_t nu = nt / NR;
|
||||
const dim_t mu = n / MR; // the n becomes m after a transposition
|
||||
const dim_t nu = m / NR; // the m becomes n after a transposition
|
||||
|
||||
if ( mu >= nu )
|
||||
//if ( mt % 2 == 1 && nt % 2 == 1 )
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = FALSE;
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
if ( use_bp )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
}
|
||||
else // if ( !use_bp )
|
||||
{
|
||||
// In the panel-block algorithm, the m dimension is parallelized
|
||||
// with jc_nt and the n dimension is parallelized with ic_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
|
||||
}
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
|
||||
if ( use_bp )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m non-primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
|
||||
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, cntl, thread );
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
else // if ( mu < nu )
|
||||
else // use_pb
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var1n non-primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
|
||||
bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, cntl, thread );
|
||||
stor_id, cntx, rntm, thread );
|
||||
// *requires nudging of mc up to be a multiple of nr.
|
||||
}
|
||||
// *requires nudging of mc,nc up to be a multiple of nr,mr.
|
||||
}
|
||||
|
||||
// Return success so that the caller knows that we computed the solution.
|
||||
|
||||
@@ -41,6 +41,5 @@ err_t bli_gemmsup_int
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
@@ -40,7 +40,6 @@ void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
stor3_t stor_id, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
@@ -80,7 +79,7 @@ void PASTEMAC(ch,opname) \
|
||||
dim_t mr, \
|
||||
dim_t* restrict m_max, \
|
||||
dim_t* restrict k_max, \
|
||||
ctype* x, inc_t rs_x, inc_t cs_x, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
@@ -97,8 +96,11 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_a )
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
stor3_t stor_id, \
|
||||
trans_t transc, \
|
||||
dim_t m_alloc, \
|
||||
dim_t k_alloc, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
@@ -107,6 +109,7 @@ void PASTEMAC(ch,opname) \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
@@ -40,7 +40,6 @@ void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
stor3_t stor_id, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
@@ -80,7 +79,7 @@ void PASTEMAC(ch,opname) \
|
||||
dim_t nr, \
|
||||
dim_t* restrict k_max, \
|
||||
dim_t* restrict n_max, \
|
||||
ctype* x, inc_t rs_x, inc_t cs_x, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
@@ -97,16 +96,20 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_b )
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
stor3_t stor_id, \
|
||||
trans_t transc, \
|
||||
dim_t k_alloc, \
|
||||
dim_t n_alloc, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict x, inc_t rs_x, inc_t cs_x, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
@@ -327,3 +327,137 @@ bli_thread_obarrier( thread ); \
|
||||
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it; \
|
||||
dim_t vector_len; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t incp, ldp; \
|
||||
conj_t conjc; \
|
||||
\
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
express the remaining parameters and code. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_swap_incs( &rs_c, &cs_c ); \
|
||||
bli_toggle_trans( &transc ); \
|
||||
} \
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
bool_t col_stored = bli_is_col_packed( schema ); \
|
||||
/*bool_t row_stored = bli_is_row_packed( schema );*/ \
|
||||
\
|
||||
if ( col_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to a column-stored matrix. */ \
|
||||
iter_dim = n; \
|
||||
vector_len = m; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
incp = 1; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
else /* if ( row_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to a row-stored matrix. */ \
|
||||
iter_dim = m; \
|
||||
vector_len = n; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
incp = 1; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim; \
|
||||
\
|
||||
\
|
||||
ctype* restrict p_begin = p_cast; \
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
|
||||
( void )nt; \
|
||||
( void )tid; \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||
will depend on whether slab or round-robin partitioning was requested
|
||||
at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||
for ( it = 0; it < n_iter; it += 1 ) \
|
||||
{ \
|
||||
ctype* restrict c_begin = c_cast + (it )*ldc; \
|
||||
\
|
||||
ctype* restrict c_use = c_begin; \
|
||||
ctype* restrict p_use = p_begin; \
|
||||
\
|
||||
{ \
|
||||
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||
or round-robin partitioning was requested at configure-time. */ \
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
conjc, \
|
||||
vector_len, \
|
||||
kappa_cast, \
|
||||
c_use, incc, \
|
||||
p_use, incp, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
\
|
||||
p_begin += ldp; \
|
||||
\
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
if ( !row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
*/ \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( packm, packm_sup_var2 )
|
||||
|
||||
|
||||
@@ -58,3 +58,21 @@ void PASTEMAC(ch,varname) \
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_var1 )
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
thrinfo_t* restrict thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_var2 )
|
||||
|
||||
|
||||
@@ -360,8 +360,6 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
/*
|
||||
const inc_t jrstep_a = rs_a * MR; \
|
||||
( void )jrstep_a; \
|
||||
*/ \
|
||||
\
|
||||
const inc_t irstep_c = cs_c * NR; \
|
||||
const inc_t irstep_b = cs_b * NR; \
|
||||
@@ -462,33 +460,56 @@ void PASTEMAC(ch,varname) \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Prepare the packing destination buffer. If packing is not requested for
|
||||
matrix B, this function will reduce to a no-op. */ \
|
||||
PASTEMAC(ch,packm_sup_init_mem_a) \
|
||||
( \
|
||||
packa, \
|
||||
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to a "panel of B". */ \
|
||||
stor_id, \
|
||||
NC, KC, MR, /* Note this "panel of B" is NC x KC. */ \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread \
|
||||
); \
|
||||
/* Define an array of bszid_t ids, which will act as our substitute for
|
||||
the cntl_t tree.
|
||||
NOTE: These bszid_t values, and their order, match that of the bp
|
||||
algorithm (variant 2) because they are not used to query actual
|
||||
blocksizes but rather query the ways of parallelism for the various
|
||||
loops. For example, the 2nd loop in variant 1 partitions in the m
|
||||
dimension (in increments of MR), but parallelizes that m dimension
|
||||
with BLIS_JR_NT. The only difference is that the _packa and _packb
|
||||
arrays have been adjusted for the semantic difference in order in
|
||||
which packa and packb nodes are encountered in the thrinfo tree.
|
||||
That is, this panel-block algorithm partitions an NC x KC submatrix
|
||||
of A to be packed in the 4th loop, and a KC x MC submatrix of B
|
||||
to be packed in the 3rd loop. */ \
|
||||
/* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \
|
||||
bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t* restrict bszids; \
|
||||
\
|
||||
/* Prepare the packing destination buffer. If packing is not requested for
|
||||
matrix B, this function will reduce to a no-op. */ \
|
||||
PASTEMAC(ch,packm_sup_init_mem_b) \
|
||||
( \
|
||||
packb, \
|
||||
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to a "block of A". */ \
|
||||
stor_id, \
|
||||
KC, MC, NR, /* Note this "block of A" is KC x MC. */ \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread \
|
||||
); \
|
||||
/* Set the bszids pointer to the correct bszids array above based on which
|
||||
matrices (if any) are being packed. */ \
|
||||
if ( packa ) { if ( packb ) bszids = bszids_packab; \
|
||||
else bszids = bszids_packa; } \
|
||||
else { if ( packb ) bszids = bszids_packb; \
|
||||
else bszids = bszids_nopack; } \
|
||||
\
|
||||
/* Determine whether we are using more than one thread. */ \
|
||||
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
thrinfo_t* restrict thread_pa = NULL; \
|
||||
thrinfo_t* restrict thread_ic = NULL; \
|
||||
thrinfo_t* restrict thread_pb = NULL; \
|
||||
thrinfo_t* restrict thread_jr = NULL; \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_jc = bszids; \
|
||||
thread_jc = thread; \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
|
||||
\
|
||||
/* Compute the JC loop thread range for the current thread. */ \
|
||||
dim_t jc_start, jc_end; \
|
||||
bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \
|
||||
const dim_t m_local = jc_end - jc_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = m_local % NC; \
|
||||
\
|
||||
/* Loop over the m dimension (NC rows/columns at a time). */ \
|
||||
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
|
||||
@@ -594,29 +615,48 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Set the bszid_t array and thrinfo_t pointer based on whether
|
||||
we will be packing A. If we won't be packing A, we alias to
|
||||
the _pc variables so that code further down can unconditionally
|
||||
reference the _pa variables. Note that *if* we will be packing
|
||||
A, the thrinfo_t node will have already been created by a
|
||||
previous call to bli_thrinfo_grow(), since bszid values of
|
||||
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
|
||||
bszid that is a normal bszid_t value). */ \
|
||||
bszid_t* restrict bszids_pa; \
|
||||
if ( packa ) { bszids_pa = &bszids_pc[1]; \
|
||||
thread_pa = bli_thrinfo_sub_node( thread_pc ); } \
|
||||
else { bszids_pa = &bszids_pc[0]; \
|
||||
thread_pa = thread_pc; } \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. (If A will not be packed, then a_use will be set to point to
|
||||
a and the _a_use strides will be set accordingly.) Then call
|
||||
the packm sup variant chooser, which will call the appropriate
|
||||
implementation based on the schema deduced from the stor_id. */ \
|
||||
implementation based on the schema deduced from the stor_id.
|
||||
NOTE: packing matrix A in this panel-block algorithm corresponds
|
||||
to packing matrix B in the block-panel algorithm. */ \
|
||||
PASTEMAC(ch,packm_sup_a) \
|
||||
( \
|
||||
packa, \
|
||||
stor_id, \
|
||||
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \
|
||||
stor_id, /* a "panel of B". */ \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
NC, KC, /* This "panel of B" is (at most) NC x KC. */ \
|
||||
nc_cur, kc_cur, MR, \
|
||||
one, \
|
||||
&one_local, \
|
||||
a_pc, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread \
|
||||
thread_pa \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix B. */ \
|
||||
matrix A. */ \
|
||||
ctype* restrict a_pc_use = a_use; \
|
||||
\
|
||||
/* We don't need to embed the panel stride of A within the auxinfo_t
|
||||
@@ -624,6 +664,20 @@ void PASTEMAC(ch,varname) \
|
||||
which occurs here, within the macrokernel, not within the
|
||||
millikernel. */ \
|
||||
/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_ic = &bszids_pa[1]; \
|
||||
thread_ic = bli_thrinfo_sub_node( thread_pa ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
|
||||
\
|
||||
/* Compute the IC loop thread range for the current thread. */ \
|
||||
dim_t ic_start, ic_end; \
|
||||
bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \
|
||||
const dim_t n_local = ic_end - ic_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = n_local % MC; \
|
||||
\
|
||||
/* Loop over the n dimension (MC rows at a time). */ \
|
||||
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
|
||||
@@ -712,6 +766,20 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Set the bszid_t array and thrinfo_t pointer based on whether
|
||||
we will be packing A. If we won't be packing A, we alias to
|
||||
the _pc variables so that code further down can unconditionally
|
||||
reference the _pa variables. Note that *if* we will be packing
|
||||
A, the thrinfo_t node will have already been created by a
|
||||
previous call to bli_thrinfo_grow(), since bszid values of
|
||||
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
|
||||
bszid that is a normal bszid_t value). */ \
|
||||
bszid_t* restrict bszids_pb; \
|
||||
if ( packb ) { bszids_pb = &bszids_ic[1]; \
|
||||
thread_pb = bli_thrinfo_sub_node( thread_ic ); } \
|
||||
else { bszids_pb = &bszids_ic[0]; \
|
||||
thread_pb = thread_ic; } \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. (If B will not be packed, then b_use will be set to point to
|
||||
@@ -723,16 +791,19 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,packm_sup_b) \
|
||||
( \
|
||||
packb, \
|
||||
stor_id, \
|
||||
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \
|
||||
stor_id, /* a "block of A". */ \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
KC, MC, /* This "block of A" is (at most) KC x MC. */ \
|
||||
kc_cur, mc_cur, NR, \
|
||||
one, \
|
||||
&one_local, \
|
||||
b_ic, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/* Alias b_use so that it's clear this is our current block of
|
||||
@@ -744,6 +815,29 @@ void PASTEMAC(ch,varname) \
|
||||
micropanels of B. */ \
|
||||
bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_jr = &bszids_pb[1]; \
|
||||
thread_jr = bli_thrinfo_sub_node( thread_pb ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JR loop. */ \
|
||||
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
|
||||
dim_t jr_left = nc_cur % MR; \
|
||||
\
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
/* An optimization: allow the last jr iteration to contain up to MRE
|
||||
rows of C and A. (If MRE > MR, the mkernel has agreed to handle
|
||||
these cases.) Note that this prevents us from declaring jr_iter and
|
||||
jr_left as const. NOTE: We forgo this optimization when packing A
|
||||
since packing an extended edge case is not yet supported. */ \
|
||||
if ( !packa && !is_mt ) \
|
||||
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
|
||||
{ \
|
||||
jr_iter--; jr_left += MR; \
|
||||
} \
|
||||
\
|
||||
/* Loop over the m dimension (NR columns at a time). */ \
|
||||
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
|
||||
@@ -1194,33 +1288,45 @@ void PASTEMAC(ch,varname) \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Prepare the packing destination buffer. If packing is not requested for
|
||||
matrix A, this function will reduce to a no-op. */ \
|
||||
PASTEMAC(ch,packm_sup_init_mem_a) \
|
||||
( \
|
||||
packa, \
|
||||
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to a "block of A". */ \
|
||||
stor_id, \
|
||||
MC, KC, MR, /* Note this "block of A" is MC x KC. */ \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread \
|
||||
); \
|
||||
/* Define an array of bszid_t ids, which will act as our substitute for
|
||||
the cntl_t tree. */ \
|
||||
/* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \
|
||||
bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t* restrict bszids; \
|
||||
\
|
||||
/* Prepare the packing destination buffer. If packing is not requested for
|
||||
matrix B, this function will reduce to a no-op. */ \
|
||||
PASTEMAC(ch,packm_sup_init_mem_b) \
|
||||
( \
|
||||
packb, \
|
||||
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to a "panel of B". */ \
|
||||
stor_id, \
|
||||
KC, NC, NR, /* Note this "panel of B" is KC x NC. */ \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread \
|
||||
); \
|
||||
/* Set the bszids pointer to the correct bszids array above based on which
|
||||
matrices (if any) are being packed. */ \
|
||||
if ( packa ) { if ( packb ) bszids = bszids_packab; \
|
||||
else bszids = bszids_packa; } \
|
||||
else { if ( packb ) bszids = bszids_packb; \
|
||||
else bszids = bszids_nopack; } \
|
||||
\
|
||||
/* Determine whether we are using more than one thread. */ \
|
||||
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
thrinfo_t* restrict thread_pb = NULL; \
|
||||
thrinfo_t* restrict thread_ic = NULL; \
|
||||
thrinfo_t* restrict thread_pa = NULL; \
|
||||
thrinfo_t* restrict thread_jr = NULL; \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_jc = bszids; \
|
||||
thread_jc = thread; \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
|
||||
\
|
||||
/* Compute the JC loop thread range for the current thread. */ \
|
||||
dim_t jc_start, jc_end; \
|
||||
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
|
||||
const dim_t n_local = jc_end - jc_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = n_local % NC; \
|
||||
\
|
||||
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
|
||||
@@ -1324,6 +1430,20 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Set the bszid_t array and thrinfo_t pointer based on whether
|
||||
we will be packing B. If we won't be packing B, we alias to
|
||||
the _pc variables so that code further down can unconditionally
|
||||
reference the _pb variables. Note that *if* we will be packing
|
||||
B, the thrinfo_t node will have already been created by a
|
||||
previous call to bli_thrinfo_grow(), since bszid values of
|
||||
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
|
||||
bszid that is a normal bszid_t value). */ \
|
||||
bszid_t* restrict bszids_pb; \
|
||||
if ( packb ) { bszids_pb = &bszids_pc[1]; \
|
||||
thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
|
||||
else { bszids_pb = &bszids_pc[0]; \
|
||||
thread_pb = thread_pc; } \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. (If B will not be packed, then a_use will be set to point to
|
||||
@@ -1333,16 +1453,19 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,packm_sup_b) \
|
||||
( \
|
||||
packb, \
|
||||
stor_id, \
|
||||
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \
|
||||
stor_id, /* a "panel of B." */ \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
KC, NC, /* This "panel of B" is (at most) KC x NC. */ \
|
||||
kc_cur, nc_cur, NR, \
|
||||
one, \
|
||||
&one_local, \
|
||||
b_pc, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
@@ -1354,6 +1477,20 @@ void PASTEMAC(ch,varname) \
|
||||
which occurs here, within the macrokernel, not within the
|
||||
millikernel. */ \
|
||||
/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_ic = &bszids_pb[1]; \
|
||||
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
|
||||
\
|
||||
/* Compute the IC loop thread range for the current thread. */ \
|
||||
dim_t ic_start, ic_end; \
|
||||
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
|
||||
const dim_t m_local = ic_end - ic_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = m_local % MC; \
|
||||
\
|
||||
/* Loop over the m dimension (MC rows at a time). */ \
|
||||
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
|
||||
@@ -1440,6 +1577,20 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Set the bszid_t array and thrinfo_t pointer based on whether
|
||||
we will be packing B. If we won't be packing A, we alias to
|
||||
the _ic variables so that code further down can unconditionally
|
||||
reference the _pa variables. Note that *if* we will be packing
|
||||
A, the thrinfo_t node will have already been created by a
|
||||
previous call to bli_thrinfo_grow(), since bszid values of
|
||||
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
|
||||
bszid that is a normal bszid_t value). */ \
|
||||
bszid_t* restrict bszids_pa; \
|
||||
if ( packa ) { bszids_pa = &bszids_ic[1]; \
|
||||
thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
|
||||
else { bszids_pa = &bszids_ic[0]; \
|
||||
thread_pa = thread_ic; } \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. (If A will not be packed, then a_use will be set to point to
|
||||
@@ -1449,16 +1600,19 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,packm_sup_a) \
|
||||
( \
|
||||
packa, \
|
||||
stor_id, \
|
||||
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \
|
||||
stor_id, /* a "block of A." */ \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
MC, KC, /* This "block of A" is (at most) MC x KC. */ \
|
||||
mc_cur, kc_cur, MR, \
|
||||
one, \
|
||||
&one_local, \
|
||||
a_ic, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread \
|
||||
thread_pa \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
@@ -1469,6 +1623,30 @@ void PASTEMAC(ch,varname) \
|
||||
millikernel will query and use this to iterate through
|
||||
micropanels of A (if needed). */ \
|
||||
bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_jr = &bszids_pa[1]; \
|
||||
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JR loop. */ \
|
||||
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
||||
dim_t jr_left = nc_cur % NR; \
|
||||
\
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
/* An optimization: allow the last jr iteration to contain up to NRE
|
||||
columns of C and B. (If NRE > NR, the mkernel has agreed to handle
|
||||
these cases.) Note that this prevents us from declaring jr_iter and
|
||||
jr_left as const. NOTE: We forgo this optimization when packing B
|
||||
since packing an extended edge case is not yet supported. */ \
|
||||
if ( !packb && !is_mt ) \
|
||||
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
|
||||
{ \
|
||||
jr_iter--; jr_left += NR; \
|
||||
} \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
|
||||
|
||||
@@ -125,7 +125,7 @@ void bli_l3_thread_decorator
|
||||
|
||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||
// we pass down the algorithmic function stack. Making thread-local
|
||||
// alaises is highly recommended in case a thread needs to change any
|
||||
// aliases is highly recommended in case a thread needs to change any
|
||||
// of the properties of an object without affecting other threads'
|
||||
// objects.
|
||||
bli_obj_alias_to( a, &a_t );
|
||||
|
||||
@@ -96,7 +96,7 @@ void* bli_l3_thread_entry( void* data_void )
|
||||
|
||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||
// we pass down the algorithmic function stack. Making thread-local
|
||||
// alaises is highly recommended in case a thread needs to change any
|
||||
// aliases is highly recommended in case a thread needs to change any
|
||||
// of the properties of an object without affecting other threads'
|
||||
// objects.
|
||||
bli_obj_alias_to( a, &a_t );
|
||||
|
||||
@@ -48,7 +48,6 @@ typedef err_t (*l3supint_t)
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
@@ -57,8 +56,6 @@ err_t bli_l3_sup_thread_decorator
|
||||
(
|
||||
l3supint_t func,
|
||||
opid_t family,
|
||||
//pack_t schema_a,
|
||||
//pack_t schema_b,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
|
||||
@@ -40,16 +40,14 @@
|
||||
// Define a dummy function bli_l3_sup_thread_entry(), which is needed in the
|
||||
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
|
||||
// or no multithreading) we don't risk having an unresolved symbol.
|
||||
//void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
|
||||
|
||||
void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
|
||||
|
||||
//#define PRINT_THRINFO
|
||||
|
||||
err_t bli_l3_sup_thread_decorator
|
||||
(
|
||||
l3supint_t func,
|
||||
opid_t family,
|
||||
//pack_t schema_a,
|
||||
//pack_t schema_b,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
@@ -59,36 +57,8 @@ err_t bli_l3_sup_thread_decorator
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
#if 0
|
||||
|
||||
return
|
||||
bli_gemmsup_int
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
0
|
||||
);
|
||||
|
||||
#else
|
||||
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
//pack_t schema_a = bli_obj_pack_schema( a );
|
||||
//pack_t schema_b = bli_obj_pack_schema( b );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// For sequential execution, we use only one thread.
|
||||
const dim_t n_threads = 1;
|
||||
// Query the total number of threads from the rntm_t object.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
@@ -99,55 +69,45 @@ err_t bli_l3_sup_thread_decorator
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we can create the global comm below.
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
#if 0
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
#endif
|
||||
|
||||
|
||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||
{
|
||||
// NOTE: We don't need to create another copy of the rntm_t since
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
cntl_t* cntl_use = NULL;
|
||||
//thrinfo_t* thread = NULL;
|
||||
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
const dim_t tid = 0;
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// NOTE: This calls the same function used for the conventional/large
|
||||
// code path.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
// NOTE: This is commented out because, in the single-threaded case,
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||
// need to alias objects for A, B, and C since they were already aliased
|
||||
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||
// consistently providing local aliases, we can then eliminate aliasing
|
||||
// elsewhere.
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
//bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
// a, b, c, rntm_p, cntl, &cntl_use );
|
||||
#if 0
|
||||
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
#endif
|
||||
|
||||
( void )tid;
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
@@ -158,23 +118,16 @@ err_t bli_l3_sup_thread_decorator
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Free the thread's local control tree.
|
||||
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
// (called from the thread entry function).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
@@ -182,8 +135,6 @@ err_t bli_l3_sup_thread_decorator
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -37,12 +37,82 @@
|
||||
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
// A data structure to assist in passing operands to additional threads.
|
||||
typedef struct thread_data
|
||||
{
|
||||
l3supint_t func;
|
||||
opid_t family;
|
||||
obj_t* alpha;
|
||||
obj_t* a;
|
||||
obj_t* b;
|
||||
obj_t* beta;
|
||||
obj_t* c;
|
||||
cntx_t* cntx;
|
||||
rntm_t* rntm;
|
||||
dim_t tid;
|
||||
thrcomm_t* gl_comm;
|
||||
array_t* array;
|
||||
} thread_data_t;
|
||||
|
||||
// Entry point for additional threads
|
||||
void* bli_l3_sup_thread_entry( void* data_void )
|
||||
{
|
||||
thread_data_t* data = data_void;
|
||||
|
||||
l3supint_t func = data->func;
|
||||
opid_t family = data->family;
|
||||
obj_t* alpha = data->alpha;
|
||||
obj_t* a = data->a;
|
||||
obj_t* b = data->b;
|
||||
obj_t* beta = data->beta;
|
||||
obj_t* c = data->c;
|
||||
cntx_t* cntx = data->cntx;
|
||||
rntm_t* rntm = data->rntm;
|
||||
dim_t tid = data->tid;
|
||||
array_t* array = data->array;
|
||||
thrcomm_t* gl_comm = data->gl_comm;
|
||||
|
||||
( void )family;
|
||||
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
err_t bli_l3_sup_thread_decorator
|
||||
(
|
||||
l3supint_t func,
|
||||
opid_t family,
|
||||
//pack_t schema_a,
|
||||
//pack_t schema_b,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
@@ -52,36 +122,8 @@ err_t bli_l3_sup_thread_decorator
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
#if 0
|
||||
|
||||
return
|
||||
bli_gemmsup_int
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
0
|
||||
);
|
||||
|
||||
#else
|
||||
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
//pack_t schema_a = bli_obj_pack_schema( a );
|
||||
//pack_t schema_b = bli_obj_pack_schema( b );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// For sequential execution, we use only one thread.
|
||||
const dim_t n_threads = 1;
|
||||
// Query the total number of threads from the context.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
@@ -92,91 +134,82 @@ err_t bli_l3_sup_thread_decorator
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we can create the global comm below.
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
#if 0
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
// Allocate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
#endif
|
||||
|
||||
// Allocate an array of pthread objects and auxiliary data structs to pass
|
||||
// to the thread entry functions.
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
|
||||
|
||||
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
|
||||
// can spawn all other threads before proceeding with its own computation.
|
||||
for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
|
||||
{
|
||||
// NOTE: We don't need to create another copy of the rntm_t since
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
// Set up thread data for additional threads (beyond thread 0).
|
||||
datas[tid].func = func;
|
||||
datas[tid].family = family;
|
||||
datas[tid].alpha = alpha;
|
||||
datas[tid].a = a;
|
||||
datas[tid].b = b;
|
||||
datas[tid].beta = beta;
|
||||
datas[tid].c = c;
|
||||
datas[tid].cntx = cntx;
|
||||
datas[tid].rntm = rntm;
|
||||
datas[tid].tid = tid;
|
||||
datas[tid].gl_comm = gl_comm;
|
||||
datas[tid].array = array;
|
||||
|
||||
cntl_t* cntl_use = NULL;
|
||||
//thrinfo_t* thread = NULL;
|
||||
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
|
||||
|
||||
const dim_t tid = 0;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
// NOTE: This is commented out because, in the single-threaded case,
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||
// need to alias objects for A, B, and C since they were already aliased
|
||||
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||
// consistently providing local aliases, we can then eliminate aliasing
|
||||
// elsewhere.
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
//bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
// a, b, c, rntm_p, cntl, &cntl_use );
|
||||
#if 0
|
||||
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
#endif
|
||||
|
||||
( void )tid;
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Free the thread's local control tree.
|
||||
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
// Spawn additional threads for ids greater than 1.
|
||||
if ( tid != 0 )
|
||||
bli_pthread_create( &pthreads[tid], NULL, &bli_l3_sup_thread_entry, &datas[tid] );
|
||||
else
|
||||
bli_l3_sup_thread_entry( ( void* )(&datas[0]) );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
// (called from the thread entry function).
|
||||
|
||||
// Thread 0 waits for additional threads to finish.
|
||||
for ( dim_t tid = 1; tid < n_threads; tid++ )
|
||||
{
|
||||
bli_pthread_join( pthreads[tid], NULL );
|
||||
}
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( pthreads );
|
||||
|
||||
#endif
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( datas );
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
580
test/supmt/Makefile
Normal file
580
test/supmt/Makefile
Normal file
@@ -0,0 +1,580 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# Makefile
|
||||
#
|
||||
# Field G. Van Zee
|
||||
#
|
||||
# Makefile for standalone BLIS test drivers.
|
||||
#
|
||||
|
||||
#
|
||||
# --- Makefile PHONY target definitions ----------------------------------------
|
||||
#
|
||||
|
||||
.PHONY: all all-st all-mt \
|
||||
blis blis-st blis-mt \
|
||||
clean cleanx
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Determine makefile fragment location -------------------------------------
|
||||
#
|
||||
|
||||
# Comments:
|
||||
# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
|
||||
# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
|
||||
# the second case because CONFIG_NAME is not yet set.
|
||||
ifneq ($(strip $(BLIS_INSTALL_PATH)),)
|
||||
LIB_PATH := $(BLIS_INSTALL_PATH)/lib
|
||||
INC_PATH := $(BLIS_INSTALL_PATH)/include/blis
|
||||
SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
|
||||
else
|
||||
DIST_PATH := ../..
|
||||
LIB_PATH = ../../lib/$(CONFIG_NAME)
|
||||
INC_PATH = ../../include/$(CONFIG_NAME)
|
||||
SHARE_PATH := ../..
|
||||
endif
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Include common makefile definitions --------------------------------------
|
||||
#
|
||||
|
||||
# Include the common makefile fragment.
|
||||
-include $(SHARE_PATH)/common.mk
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- BLAS and LAPACK implementations ------------------------------------------
|
||||
#
|
||||
|
||||
# BLIS library and header path. This is simply wherever it was installed.
|
||||
#BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib
|
||||
#BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis
|
||||
|
||||
# BLIS library.
|
||||
#BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a
|
||||
|
||||
# BLAS library path(s). This is where the BLAS libraries reside.
|
||||
HOME_LIB_PATH := $(HOME)/flame/lib
|
||||
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
|
||||
|
||||
# netlib BLAS
|
||||
NETLIB_LIB := $(HOME_LIB_PATH)/libblas.a
|
||||
|
||||
# OpenBLAS
|
||||
OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a
|
||||
OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a
|
||||
|
||||
# BLASFEO
|
||||
BLASFEO_LIB := $(HOME_LIB_PATH)/libblasfeo.a
|
||||
|
||||
# libxsmm
|
||||
LIBXSMM_LIB := $(HOME_LIB_PATH)/libxsmm.a -ldl \
|
||||
$(NETLIB_LIB) -lgfortran
|
||||
|
||||
# ATLAS
|
||||
ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \
|
||||
$(HOME_LIB_PATH)/libatlas.a
|
||||
|
||||
# Eigen
|
||||
EIGEN_INC := $(HOME)/flame/eigen/include/eigen3
|
||||
EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a
|
||||
EIGENP_LIB := $(EIGEN_LIB)
|
||||
|
||||
# MKL
|
||||
MKL_LIB := -L$(MKL_LIB_PATH) \
|
||||
-lmkl_intel_lp64 \
|
||||
-lmkl_core \
|
||||
-lmkl_sequential \
|
||||
-lpthread -lm -ldl
|
||||
MKLP_LIB := -L$(MKL_LIB_PATH) \
|
||||
-lmkl_intel_lp64 \
|
||||
-lmkl_core \
|
||||
-lmkl_gnu_thread \
|
||||
-lpthread -lm -ldl -fopenmp
|
||||
#-L$(ICC_LIB_PATH) \
|
||||
#-lgomp
|
||||
|
||||
VENDOR_LIB := $(MKL_LIB)
|
||||
VENDORP_LIB := $(MKLP_LIB)
|
||||
|
||||
|
||||
#
|
||||
# --- Problem size definitions -------------------------------------------------
|
||||
#
|
||||
|
||||
# Single core
|
||||
PS_BEGIN := 4
|
||||
PS_MAX := 800
|
||||
PS_INC := 4
|
||||
|
||||
# Multicore
|
||||
P1_BEGIN := 8
|
||||
P1_MAX := 1600
|
||||
P1_INC := 8
|
||||
|
||||
|
||||
#
|
||||
# --- General build definitions ------------------------------------------------
|
||||
#
|
||||
|
||||
TEST_SRC_PATH := .
|
||||
TEST_OBJ_PATH := .
|
||||
|
||||
# Gather all local object files.
|
||||
TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
|
||||
$(TEST_OBJ_PATH)/%.o, \
|
||||
$(wildcard $(TEST_SRC_PATH)/*.c)))
|
||||
|
||||
# Override the value of CINCFLAGS so that the value of CFLAGS returned by
|
||||
# get-frame-cflags-for() is not cluttered up with include paths needed only
|
||||
# while building BLIS.
|
||||
CINCFLAGS := -I$(INC_PATH)
|
||||
|
||||
# Use the "framework" CFLAGS for the configuration family.
|
||||
CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
|
||||
|
||||
# Add local header paths to CFLAGS.
|
||||
CFLAGS += -I$(TEST_SRC_PATH)
|
||||
|
||||
# Locate the libblis library to which we will link.
|
||||
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
|
||||
|
||||
# Define a set of CFLAGS for use with C++ and Eigen.
|
||||
CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS))
|
||||
CXXFLAGS += -I$(EIGEN_INC)
|
||||
|
||||
# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading.
|
||||
CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS))
|
||||
CXXFLAGS_MT := -march=native $(CXXFLAGS)
|
||||
|
||||
# Single or multithreaded string
|
||||
STR_ST := -DTHR_STR=\"st\"
|
||||
STR_MT := -DTHR_STR=\"mt\"
|
||||
|
||||
# Number of trials per problem size.
|
||||
N_TRIALS := -DN_TRIALS=3
|
||||
|
||||
# Problem size specification
|
||||
PDEF_ST := -DP_BEGIN=$(PS_BEGIN) \
|
||||
-DP_MAX=$(PS_MAX) \
|
||||
-DP_INC=$(PS_INC)
|
||||
|
||||
PDEF_MT := -DP_BEGIN=$(P1_BEGIN) \
|
||||
-DP_MAX=$(P1_MAX) \
|
||||
-DP_INC=$(P1_INC)
|
||||
|
||||
ifeq ($(E),1)
|
||||
ERRCHK := -DERROR_CHECK
|
||||
else
|
||||
ERRCHK := -DNO_ERROR_CHECK
|
||||
endif
|
||||
|
||||
# Enumerate possible datatypes and computation precisions.
|
||||
#dts := s d c z
|
||||
DTS := d
|
||||
|
||||
TRANS := n_n \
|
||||
n_t \
|
||||
t_n \
|
||||
t_t
|
||||
|
||||
# While BLIS supports all combinations of row and column storage for matrices
|
||||
# C, A, and B, the alternatives mostly only support CBLAS APIs, which inherently
|
||||
# support only "all row-storage" or "all column-storage". Thus, we disable the
|
||||
# building of those other drivers so that compilation/linking completes sooner.
|
||||
#STORS := r_r_r \
|
||||
# r_r_c \
|
||||
# r_c_r \
|
||||
# r_c_c \
|
||||
# c_r_r \
|
||||
# c_r_c \
|
||||
# c_c_r \
|
||||
# c_c_c
|
||||
STORS := r_r_r \
|
||||
c_c_c
|
||||
|
||||
|
||||
SHAPES := l_l_s \
|
||||
l_s_l \
|
||||
s_l_l \
|
||||
s_s_l \
|
||||
s_l_s \
|
||||
l_s_s \
|
||||
l_l_l
|
||||
|
||||
SMS := 6
|
||||
SNS := 8
|
||||
SKS := 10
|
||||
|
||||
|
||||
#
|
||||
# --- Function definitions -----------------------------------------------------
|
||||
#
|
||||
|
||||
# A function to strip the underscores from a list of strings.
|
||||
stripu = $(subst _,,$(1))
|
||||
|
||||
# Various functions that help us construct the datatype combinations and then
|
||||
# extract the needed datatype strings and C preprocessor define flags.
|
||||
get-1of2 = $(word 1,$(subst _, ,$(1)))
|
||||
get-2of2 = $(word 2,$(subst _, ,$(1)))
|
||||
|
||||
get-1of3 = $(word 1,$(subst _, ,$(1)))
|
||||
get-2of3 = $(word 2,$(subst _, ,$(1)))
|
||||
get-3of3 = $(word 3,$(subst _, ,$(1)))
|
||||
|
||||
# Datatype defs.
|
||||
get-dt-cpp = $(strip \
|
||||
$(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\
|
||||
$(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\
|
||||
$(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
|
||||
-DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
|
||||
|
||||
# Transpose defs.
|
||||
get-tra-defs-a = $(strip $(subst n,-DTRANSA=BLIS_NO_TRANSPOSE -DA_NOTRANS, \
|
||||
$(subst t,-DTRANSA=BLIS_TRANSPOSE -DA_TRANS,$(call get-1of2,$(1)))))
|
||||
get-tra-defs-b = $(strip $(subst n,-DTRANSB=BLIS_NO_TRANSPOSE -DB_NOTRANS, \
|
||||
$(subst t,-DTRANSB=BLIS_TRANSPOSE -DB_TRANS,$(call get-2of2,$(1)))))
|
||||
get-tra-defs = $(call get-tra-defs-a,$(1)) $(call get-tra-defs-b,$(1))
|
||||
|
||||
# Storage defs.
|
||||
get-sto-uch-a = $(strip $(subst r,R, \
|
||||
$(subst c,C,$(call get-1of3,$(1)))))
|
||||
get-sto-uch-b = $(strip $(subst r,R, \
|
||||
$(subst c,C,$(call get-2of3,$(1)))))
|
||||
get-sto-uch-c = $(strip $(subst r,R, \
|
||||
$(subst c,C,$(call get-3of3,$(1)))))
|
||||
get-sto-defs = $(strip \
|
||||
-DSTOR3=BLIS_$(call get-sto-uch-a,$(1))$(call get-sto-uch-b,$(1))$(call get-sto-uch-c,$(1)) \
|
||||
-DA_STOR_$(call get-sto-uch-a,$(1)) \
|
||||
-DB_STOR_$(call get-sto-uch-b,$(1)) \
|
||||
-DC_STOR_$(call get-sto-uch-c,$(1)))
|
||||
|
||||
# Dimension defs.
|
||||
get-shape-defs-cm = $(if $(findstring l,$(1)),-DM_DIM=-1,-DM_DIM=$(2))
|
||||
get-shape-defs-cn = $(if $(findstring l,$(1)),-DN_DIM=-1,-DN_DIM=$(2))
|
||||
get-shape-defs-ck = $(if $(findstring l,$(1)),-DK_DIM=-1,-DK_DIM=$(2))
|
||||
get-shape-defs-m = $(call get-shape-defs-cm,$(call get-1of3,$(1)),$(2))
|
||||
get-shape-defs-n = $(call get-shape-defs-cn,$(call get-2of3,$(1)),$(2))
|
||||
get-shape-defs-k = $(call get-shape-defs-ck,$(call get-3of3,$(1)),$(2))
|
||||
|
||||
# arguments: 1: shape (w/ underscores) 2: smallm 3: smalln 4: smallk
|
||||
get-shape-defs = $(strip $(call get-shape-defs-m,$(1),$(2)) \
|
||||
$(call get-shape-defs-n,$(1),$(3)) \
|
||||
$(call get-shape-defs-k,$(1),$(4)))
|
||||
|
||||
#$(error l_l_s 6 8 4 = $(call get-shape-defs,l_l_s,6,8,4))
|
||||
|
||||
# Shape-dimension string.
|
||||
get-shape-str-ch = $(if $(findstring l,$(1)),p,$(2))
|
||||
get-shape-str-m = $(call get-shape-str-ch,$(call get-1of3,$(1)),$(2))
|
||||
get-shape-str-n = $(call get-shape-str-ch,$(call get-2of3,$(1)),$(2))
|
||||
get-shape-str-k = $(call get-shape-str-ch,$(call get-3of3,$(1)),$(2))
|
||||
|
||||
# arguments: 1: shape (w/ underscores) 2: smallm 3: smalln 4: smallk
|
||||
get-shape-dim-str = m$(call get-shape-str-m,$(1),$(2))n$(call get-shape-str-n,$(1),$(3))k$(call get-shape-str-k,$(1),$(4))
|
||||
|
||||
# Implementation defs.
|
||||
# Define a function to return the appropriate -DSTR= and -D[BLIS|BLAS] flags.
|
||||
get-imp-defs = $(strip $(subst blissup,-DSTR=\"$(1)\" -DBLIS -DSUP, \
|
||||
$(subst blislpab,-DSTR=\"$(1)\" -DBLIS, \
|
||||
$(subst eigen,-DSTR=\"$(1)\" -DEIGEN, \
|
||||
$(subst openblas,-DSTR=\"$(1)\" -DCBLAS, \
|
||||
$(subst blasfeo,-DSTR=\"$(1)\" -DCBLAS, \
|
||||
$(subst libxsmm,-DSTR=\"$(1)\" -DBLAS -DXSMM, \
|
||||
$(subst vendor,-DSTR=\"$(1)\" -DCBLAS,$(1)))))))))
|
||||
|
||||
TRANS0 = $(call stripu,$(TRANS))
|
||||
STORS0 = $(call stripu,$(STORS))
|
||||
|
||||
# Limit BLAS and Eigen to only using all row-stored, or all column-stored matrices.
|
||||
# Also, limit libxsmm to using all column-stored matrices since it does not offer
|
||||
# CBLAS interfaces.
|
||||
BSTORS0 = rrr ccc
|
||||
ESTORS0 = rrr ccc
|
||||
XSTORS0 = ccc
|
||||
|
||||
|
||||
#
|
||||
# --- Object and binary file definitons ----------------------------------------
|
||||
#
|
||||
|
||||
get-st-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_st.o)))))))
|
||||
|
||||
# Build a list of object files and binaries for each single-threaded
|
||||
# implementation using the get-st-objs() function defined above.
|
||||
BLISSUP_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup)
|
||||
BLISSUP_ST_BINS := $(patsubst %.o,%.x,$(BLISSUP_ST_OBJS))
|
||||
|
||||
BLISLPAB_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab)
|
||||
BLISLPAB_ST_BINS := $(patsubst %.o,%.x,$(BLISLPAB_ST_OBJS))
|
||||
|
||||
EIGEN_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen)
|
||||
EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
|
||||
|
||||
OPENBLAS_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas)
|
||||
OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
|
||||
|
||||
BLASFEO_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blasfeo)
|
||||
BLASFEO_ST_BINS := $(patsubst %.o,%.x,$(BLASFEO_ST_OBJS))
|
||||
|
||||
LIBXSMM_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),libxsmm)
|
||||
LIBXSMM_ST_BINS := $(patsubst %.o,%.x,$(LIBXSMM_ST_OBJS))
|
||||
|
||||
VENDOR_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor)
|
||||
VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
|
||||
|
||||
# Mark the object files as intermediate so that make will remove them
|
||||
# automatically after building the binaries on which they depend.
|
||||
.INTERMEDIATE: $(BLISSUP_ST_OBJS) \
|
||||
$(BLISLPAB_ST_OBJS) \
|
||||
$(EIGEN_ST_OBJS) \
|
||||
$(OPENBLAS_ST_OBJS) \
|
||||
$(BLASFEO_ST_OBJS) \
|
||||
$(LIBXSMM_ST_OBJS) \
|
||||
$(VENDOR_ST_OBJS)
|
||||
|
||||
get-mt-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_mt.o)))))))
|
||||
|
||||
# Build a list of object files and binaries for each multithreaded
|
||||
# implementation using the get-st-objs() function defined above.
|
||||
BLISSUP_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup)
|
||||
BLISSUP_MT_BINS := $(patsubst %.o,%.x,$(BLISSUP_MT_OBJS))
|
||||
|
||||
BLISLPAB_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab)
|
||||
BLISLPAB_MT_BINS := $(patsubst %.o,%.x,$(BLISLPAB_MT_OBJS))
|
||||
|
||||
EIGEN_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen)
|
||||
EIGEN_MT_BINS := $(patsubst %.o,%.x,$(EIGEN_MT_OBJS))
|
||||
|
||||
OPENBLAS_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas)
|
||||
OPENBLAS_MT_BINS := $(patsubst %.o,%.x,$(OPENBLAS_MT_OBJS))
|
||||
|
||||
VENDOR_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor)
|
||||
VENDOR_MT_BINS := $(patsubst %.o,%.x,$(VENDOR_MT_OBJS))
|
||||
|
||||
#$(error "objs = $(EIGEN_ST_BINS)" )
|
||||
|
||||
# Mark the object files as intermediate so that make will remove them
|
||||
# automatically after building the binaries on which they depend.
|
||||
.INTERMEDIATE: $(BLISSUP_MT_OBJS) \
|
||||
$(BLISLPAB_MT_OBJS) \
|
||||
$(EIGEN_MT_OBJS) \
|
||||
$(OPENBLAS_MT_OBJS) \
|
||||
$(VENDOR_MT_OBJS)
|
||||
|
||||
|
||||
#
|
||||
# --- Targets/rules ------------------------------------------------------------
|
||||
#
|
||||
|
||||
all: st
|
||||
|
||||
blis: blissup-st blislpab-st
|
||||
|
||||
blissup: blissup-st
|
||||
blislpab: blislpab-st
|
||||
eigen: eigen-st
|
||||
openblas: openblas-st
|
||||
blasfeo: blasfeo-st
|
||||
libxsmm: libxsmm-st
|
||||
vendor: vendor-st
|
||||
|
||||
st: blissup-st blislpab-st \
|
||||
eigen-st openblas-st blasfeo-st libxsmm-st vendor-st
|
||||
|
||||
blissup-st: $(BLISSUP_ST_BINS)
|
||||
blislpab-st: $(BLISLPAB_ST_BINS)
|
||||
eigen-st: $(EIGEN_ST_BINS)
|
||||
openblas-st: $(OPENBLAS_ST_BINS)
|
||||
blasfeo-st: $(BLASFEO_ST_BINS)
|
||||
libxsmm-st: $(LIBXSMM_ST_BINS)
|
||||
vendor-st: $(VENDOR_ST_BINS)
|
||||
|
||||
mt: blissup-mt blislpab-mt \
|
||||
eigen-mt openblas-mt vendor-mt
|
||||
|
||||
blissup-mt: $(BLISSUP_MT_BINS)
|
||||
blislpab-mt: $(BLISLPAB_MT_BINS)
|
||||
eigen-mt: $(EIGEN_MT_BINS)
|
||||
openblas-mt: $(OPENBLAS_MT_BINS)
|
||||
vendor-mt: $(VENDOR_MT_BINS)
|
||||
|
||||
|
||||
|
||||
# --Object file rules --
|
||||
|
||||
# Define the implementations for which we will instantiate compilation rules.
|
||||
BIMPLS_ST := blissup blislpab openblas blasfeo libxsmm vendor
|
||||
BIMPLS_MT := blissup blislpab openblas vendor
|
||||
EIMPLS := eigen
|
||||
|
||||
# 1 2 3 4 567 8
|
||||
# test_dgemm_nn_rrr_mpn6kp_blissup_st.x
|
||||
|
||||
# Define the function that will be used to instantiate compilation rules
|
||||
# for the various single-threaded implementations.
|
||||
define make-st-rule
|
||||
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile
|
||||
$(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
# Instantiate the rule function make-st-rule() for each BLIS/BLAS/CBLAS
|
||||
# implementation.
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach tr,$(TRANS), \
|
||||
$(foreach st,$(STORS), \
|
||||
$(foreach sh,$(SHAPES), \
|
||||
$(foreach sm,$(SMS), \
|
||||
$(foreach sn,$(SNS), \
|
||||
$(foreach sk,$(SKS), \
|
||||
$(foreach impl,$(BIMPLS_ST), \
|
||||
$(eval $(call make-st-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
|
||||
|
||||
# Define the function that will be used to instantiate compilation rules
|
||||
# for the various multithreaded implementations.
|
||||
define make-mt-rule
|
||||
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
|
||||
$(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
# Instantiate the rule function make-mt-rule() for each BLIS/BLAS/CBLAS
|
||||
# implementation.
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach tr,$(TRANS), \
|
||||
$(foreach st,$(STORS), \
|
||||
$(foreach sh,$(SHAPES), \
|
||||
$(foreach sm,$(SMS), \
|
||||
$(foreach sn,$(SNS), \
|
||||
$(foreach sk,$(SKS), \
|
||||
$(foreach impl,$(BIMPLS_MT), \
|
||||
$(eval $(call make-mt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
|
||||
|
||||
# Define the function that will be used to instantiate compilation rules
|
||||
# for the single-threaded Eigen implementation.
|
||||
define make-eigst-rule
|
||||
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile
|
||||
$(CXX) $(CXXFLAGS_ST) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
# Instantiate the rule function make-st-rule() for each Eigen implementation.
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach tr,$(TRANS), \
|
||||
$(foreach st,$(STORS), \
|
||||
$(foreach sh,$(SHAPES), \
|
||||
$(foreach sm,$(SMS), \
|
||||
$(foreach sn,$(SNS), \
|
||||
$(foreach sk,$(SKS), \
|
||||
$(foreach impl,$(EIMPLS), \
|
||||
$(eval $(call make-eigst-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
|
||||
|
||||
# Define the function that will be used to instantiate compilation rules
|
||||
# for the multithreaded Eigen implementation.
|
||||
define make-eigmt-rule
|
||||
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
|
||||
$(CXX) $(CXXFLAGS_MT) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
# Instantiate the rule function make-st-rule() for each Eigen implementation.
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach tr,$(TRANS), \
|
||||
$(foreach st,$(STORS), \
|
||||
$(foreach sh,$(SHAPES), \
|
||||
$(foreach sm,$(SMS), \
|
||||
$(foreach sn,$(SNS), \
|
||||
$(foreach sk,$(SKS), \
|
||||
$(foreach impl,$(EIMPLS), \
|
||||
$(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
|
||||
|
||||
|
||||
# -- Executable file rules --
|
||||
|
||||
# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
|
||||
# on the link command line in case BLIS was configured with the BLAS
|
||||
# compatibility layer. This prevents BLIS from inadvertently getting called
|
||||
# for the BLAS routines we are trying to test with.
|
||||
|
||||
test_%_blissup_st.x: test_%_blissup_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_blislpab_st.x: test_%_blislpab_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_eigen_st.x: test_%_eigen_st.o $(LIBBLIS_LINK)
|
||||
$(CXX) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_blasfeo_st.x: test_%_blasfeo_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(BLASFEO_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_libxsmm_st.x: test_%_libxsmm_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBXSMM_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_vendor_st.x: test_%_vendor_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
test_%_blissup_mt.x: test_%_blissup_mt.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_blislpab_mt.x: test_%_blislpab_mt.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_eigen_mt.x: test_%_eigen_mt.o $(LIBBLIS_LINK)
|
||||
$(CXX) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_openblas_mt.x: test_%_openblas_mt.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_vendor_mt.x: test_%_vendor_mt.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
# -- Clean rules --
|
||||
|
||||
clean: cleanx
|
||||
|
||||
cleanx:
|
||||
- $(RM_F) *.x *.o
|
||||
|
||||
52
test/supmt/octave/gen_opsupnames.m
Normal file
52
test/supmt/octave/gen_opsupnames.m
Normal file
@@ -0,0 +1,52 @@
|
||||
function [ r_val1, r_val2 ] = gen_opsupnames( ops, stor, smalldims )
|
||||
|
||||
nops = size( ops, 1 );
|
||||
|
||||
smallm = smalldims( 1 );
|
||||
smalln = smalldims( 2 );
|
||||
smallk = smalldims( 3 );
|
||||
|
||||
i = 1;
|
||||
|
||||
for io = 1:nops
|
||||
|
||||
op = ops( io, : );
|
||||
|
||||
str0 = sprintf( '%s_%s_m%dnpkp', op, stor, smallm );
|
||||
str1 = sprintf( '%s_%s_mpn%dkp', op, stor, smalln );
|
||||
str2 = sprintf( '%s_%s_mpnpk%d', op, stor, smallk );
|
||||
str3 = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk );
|
||||
str4 = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk );
|
||||
str5 = sprintf( '%s_%s_m%dn%dkp', op, stor, smallm, smalln );
|
||||
str6 = sprintf( '%s_%s_mpnpkp', op, stor );
|
||||
|
||||
%opsupnames( i+0, : ) = sprintf( '%s_%s_m%dnpkp ', op, stor, smallm )
|
||||
%opsupnames( i+1, : ) = sprintf( '%s_%s_mpn%dkp ', op, stor, smalln )
|
||||
%opsupnames( i+2, : ) = sprintf( '%s_%s_mpnpk%d', op, stor, smallk )
|
||||
%opsupnames( i+3, : ) = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk )
|
||||
%opsupnames( i+4, : ) = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk )
|
||||
%opsupnames( i+5, : ) = sprintf( '%s_%s_m%dn%dkp ', op, stor, smallm, smalln )
|
||||
%opsupnames( i+6, : ) = sprintf( '%s_%s_mpnpkp ', op, stor )
|
||||
|
||||
opsupnames( i+0, : ) = sprintf( '%-20s', str0 );
|
||||
opsupnames( i+1, : ) = sprintf( '%-20s', str1 );
|
||||
opsupnames( i+2, : ) = sprintf( '%-20s', str2 );
|
||||
opsupnames( i+3, : ) = sprintf( '%-20s', str3 );
|
||||
opsupnames( i+4, : ) = sprintf( '%-20s', str4 );
|
||||
opsupnames( i+5, : ) = sprintf( '%-20s', str5 );
|
||||
opsupnames( i+6, : ) = sprintf( '%-20s', str6 );
|
||||
|
||||
opnames( i+0, : ) = sprintf( '%s', op );
|
||||
opnames( i+1, : ) = sprintf( '%s', op );
|
||||
opnames( i+2, : ) = sprintf( '%s', op );
|
||||
opnames( i+3, : ) = sprintf( '%s', op );
|
||||
opnames( i+4, : ) = sprintf( '%s', op );
|
||||
opnames( i+5, : ) = sprintf( '%s', op );
|
||||
opnames( i+6, : ) = sprintf( '%s', op );
|
||||
|
||||
i = i + 7;
|
||||
end
|
||||
|
||||
r_val1 = opsupnames;
|
||||
r_val2 = opnames;
|
||||
|
||||
250
test/supmt/octave/plot_l3sup_perf.m
Normal file
250
test/supmt/octave/plot_l3sup_perf.m
Normal file
@@ -0,0 +1,250 @@
|
||||
function r_val = plot_l3sup_perf( opname, ...
|
||||
data_blissup, ...
|
||||
data_blislpab, ...
|
||||
data_eigen, ...
|
||||
data_open, ...
|
||||
data_vend, vend_str, ...
|
||||
nth, ...
|
||||
rows, cols, ...
|
||||
cfreq, ...
|
||||
dfps, ...
|
||||
theid, impl )
|
||||
%if ... %mod(theid-1,cols) == 2 || ...
|
||||
% ... %mod(theid-1,cols) == 3 || ...
|
||||
% ... %mod(theid-1,cols) == 4 || ...
|
||||
% 0 == 1 ... %theid >= 19
|
||||
% show_plot = 0;
|
||||
%else
|
||||
show_plot = 1;
|
||||
%end
|
||||
|
||||
%legend_plot_id = 11;
|
||||
legend_plot_id = 1*cols + 1*5;
|
||||
|
||||
if 1
|
||||
ax1 = subplot( rows, cols, theid );
|
||||
hold( ax1, 'on' );
|
||||
end
|
||||
|
||||
% Set line properties.
|
||||
color_blissup = 'k'; lines_blissup = '-'; markr_blissup = '';
|
||||
color_blislpab = 'k'; lines_blislpab = ':'; markr_blislpab = '';
|
||||
color_eigen = 'm'; lines_eigen = '-.'; markr_eigen = 'o';
|
||||
color_open = 'r'; lines_open = '--'; markr_open = 'o';
|
||||
color_vend = 'b'; lines_vend = '-.'; markr_vend = '.';
|
||||
|
||||
% Compute the peak performance in terms of the number of double flops
|
||||
% executable per cycle and the clock rate.
|
||||
if opname(1) == 's' || opname(1) == 'c'
|
||||
flopspercycle = dfps * 2;
|
||||
else
|
||||
flopspercycle = dfps;
|
||||
end
|
||||
max_perf_core = (flopspercycle * cfreq) * 1;
|
||||
|
||||
% Escape underscores in the title.
|
||||
title_opname = strrep( opname, '_', '\_' );
|
||||
|
||||
% Print the title to a string.
|
||||
titlename = '%s';
|
||||
titlename = sprintf( titlename, title_opname );
|
||||
|
||||
% Set the legend strings.
|
||||
blissup_legend = sprintf( 'BLIS sup' );
|
||||
blislpab_legend = sprintf( 'BLIS conv' );
|
||||
eigen_legend = sprintf( 'Eigen' );
|
||||
open_legend = sprintf( 'OpenBLAS' );
|
||||
%vend_legend = sprintf( 'MKL' );
|
||||
%vend_legend = sprintf( 'ARMPL' );
|
||||
vend_legend = vend_str;
|
||||
|
||||
% Set axes range values.
|
||||
y_scale = 1.00;
|
||||
x_begin = 0;
|
||||
%x_end is set below.
|
||||
y_begin = 0;
|
||||
y_end = max_perf_core * y_scale;
|
||||
|
||||
% Set axes names.
|
||||
if nth == 1
|
||||
yaxisname = 'GFLOPS';
|
||||
else
|
||||
yaxisname = 'GFLOPS/core';
|
||||
end
|
||||
|
||||
|
||||
%flopscol = 4;
|
||||
flopscol = size( data_blissup, 2 );
|
||||
msize = 5;
|
||||
if 1
|
||||
fontsize = 11;
|
||||
else
|
||||
fontsize = 16;
|
||||
end
|
||||
linesize = 0.5;
|
||||
legend_loc = 'southeast';
|
||||
|
||||
% --------------------------------------------------------------------
|
||||
|
||||
% Automatically detect a column with the increasing problem size.
|
||||
% Then set the maximum x-axis value.
|
||||
for psize_col = 1:3
|
||||
if data_blissup( 1, psize_col ) ~= data_blissup( 2, psize_col )
|
||||
break;
|
||||
end
|
||||
end
|
||||
x_axis( :, 1 ) = data_blissup( :, psize_col );
|
||||
|
||||
% Compute the number of data points we have in the x-axis. Note that
|
||||
% we only use quarter the data points for the m = n = k column of graphs.
|
||||
if mod(theid-1,cols) == 6
|
||||
np = size( data_blissup, 1 ) / 4;
|
||||
else
|
||||
np = size( data_blissup, 1 );
|
||||
end
|
||||
|
||||
% Grab the last x-axis value.
|
||||
x_end = data_blissup( np, psize_col );
|
||||
|
||||
%data_peak( 1, 1:2 ) = [ 0 max_perf_core ];
|
||||
%data_peak( 2, 1:2 ) = [ x_end max_perf_core ];
|
||||
|
||||
if show_plot == 1
|
||||
blissup_ln = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_blissup, 'LineStyle',lines_blissup, ...
|
||||
'LineWidth',linesize );
|
||||
blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
|
||||
'LineWidth',linesize );
|
||||
eigen_ln = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_eigen, 'LineStyle',lines_eigen, ...
|
||||
'LineWidth',linesize );
|
||||
open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_open, 'LineStyle',lines_open, ...
|
||||
'LineWidth',linesize );
|
||||
vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_vend, 'LineStyle',lines_vend, ...
|
||||
'LineWidth',linesize );
|
||||
else
|
||||
if theid == legend_plot_id
|
||||
blissup_ln = line( nan, nan, ...
|
||||
'Color',color_blissup, 'LineStyle',lines_blissup, ...
|
||||
'LineWidth',linesize );
|
||||
blislpab_ln = line( nan, nan, ...
|
||||
'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
|
||||
'LineWidth',linesize );
|
||||
eigen_ln = line( nan, nan, ...
|
||||
'Color',color_eigen, 'LineStyle',lines_eigen, ...
|
||||
'LineWidth',linesize );
|
||||
open_ln = line( nan, nan, ...
|
||||
'Color',color_open, 'LineStyle',lines_open, ...
|
||||
'LineWidth',linesize );
|
||||
vend_ln = line( nan, nan, ...
|
||||
'Color',color_vend, 'LineStyle',lines_vend, ...
|
||||
'LineWidth',linesize );
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
xlim( ax1, [x_begin x_end] );
|
||||
ylim( ax1, [y_begin y_end] );
|
||||
|
||||
if 6000 <= x_end && x_end < 10000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
elseif 4000 <= x_end && x_end < 6000
|
||||
x_tick2 = x_end - 1000;
|
||||
x_tick1 = x_tick2/2;
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
elseif 2000 <= x_end && x_end < 3000
|
||||
x_tick2 = x_end - 400;
|
||||
x_tick1 = x_tick2/2;
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
elseif 500 <= x_end && x_end < 1000
|
||||
x_tick3 = x_end*(3/4);
|
||||
x_tick2 = x_end*(2/4);
|
||||
x_tick1 = x_end*(1/4);
|
||||
xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] );
|
||||
end
|
||||
|
||||
if show_plot == 1 || theid == legend_plot_id
|
||||
if theid == legend_plot_id
|
||||
leg = legend( ...
|
||||
[ ...
|
||||
blissup_ln ...
|
||||
blislpab_ln ...
|
||||
eigen_ln ...
|
||||
open_ln ...
|
||||
vend_ln ...
|
||||
], ...
|
||||
blissup_legend, ...
|
||||
blislpab_legend, ...
|
||||
eigen_legend, ...
|
||||
open_legend, ...
|
||||
vend_legend, ...
|
||||
'Location', legend_loc );
|
||||
set( leg,'Box','off' );
|
||||
set( leg,'Color','none' );
|
||||
set( leg,'Units','inches' );
|
||||
if impl == 'octave'
|
||||
set( leg,'FontSize',fontsize );
|
||||
set( leg,'Position',[12.50 10.35 1.5 0.9 ] ); % (1,4tl)
|
||||
else
|
||||
set( leg,'FontSize',fontsize-1 );
|
||||
set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
|
||||
end
|
||||
set( leg,'Box','off' );
|
||||
set( leg,'Color','none' );
|
||||
set( leg,'Units','inches' );
|
||||
% xpos ypos
|
||||
%set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl)
|
||||
end
|
||||
end
|
||||
|
||||
set( ax1,'FontSize',fontsize );
|
||||
set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1.
|
||||
box( ax1, 'on' );
|
||||
|
||||
titl = title( titlename );
|
||||
set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'.
|
||||
|
||||
if impl == 'octave'
|
||||
tpos = get( titl, 'Position' ); % default is to align across whole figure, not box.
|
||||
tpos(1) = tpos(1) + -40;
|
||||
set( titl, 'Position', tpos ); % here we nudge it back to centered with box.
|
||||
end
|
||||
|
||||
if theid > (rows-1)*cols
|
||||
%xlab = xlabel( ax1,xaxisname );
|
||||
%tpos = get( xlab, 'Position' )
|
||||
%tpos(2) = tpos(2) + 10;
|
||||
%set( xlab, 'Position', tpos );
|
||||
if theid == rows*cols - 6
|
||||
xlab = xlabel( ax1, 'm = 6; n = k' );
|
||||
elseif theid == rows*cols - 5
|
||||
xlab = xlabel( ax1, 'n = 8; m = k' );
|
||||
elseif theid == rows*cols - 4
|
||||
xlab = xlabel( ax1, 'k = 10; m = n' );
|
||||
elseif theid == rows*cols - 3
|
||||
xlab = xlabel( ax1, 'm; n = 8, k = 10' );
|
||||
elseif theid == rows*cols - 2
|
||||
xlab = xlabel( ax1, 'n; m = 6, k = 10' );
|
||||
elseif theid == rows*cols - 1
|
||||
xlab = xlabel( ax1, 'k; m = 6, n = 8' );
|
||||
elseif theid == rows*cols - 0
|
||||
xlab = xlabel( ax1, 'm = n = k' );
|
||||
end
|
||||
end
|
||||
|
||||
if mod(theid-1,cols) == 0
|
||||
ylab = ylabel( ax1,yaxisname );
|
||||
end
|
||||
|
||||
%export_fig( filename, colorflag, '-pdf', '-m2', '-painters', '-transparent' );
|
||||
%saveas( fig, filename_png );
|
||||
|
||||
%hold( ax1, 'off' );
|
||||
|
||||
r_val = 0;
|
||||
|
||||
152
test/supmt/octave/plot_panel_trxsh.m
Normal file
152
test/supmt/octave/plot_panel_trxsh.m
Normal file
@@ -0,0 +1,152 @@
|
||||
function r_val = plot_panel_trxsh ...
|
||||
( ...
|
||||
cfreq, ...
|
||||
dflopspercycle, ...
|
||||
nth, ...
|
||||
thr_str, ...
|
||||
dt_ch, ...
|
||||
stor_str, ...
|
||||
smalldims, ...
|
||||
dirpath, ...
|
||||
arch_str, ...
|
||||
vend_str, ...
|
||||
impl ...
|
||||
)
|
||||
|
||||
%cfreq = 1.8;
|
||||
%dflopspercycle = 32;
|
||||
|
||||
% Create filename "templates" for the files that contain the performance
|
||||
% results.
|
||||
filetemp_blissup = '%s/output_%s_%s_blissup.m';
|
||||
filetemp_blislpab = '%s/output_%s_%s_blislpab.m';
|
||||
filetemp_eigen = '%s/output_%s_%s_eigen.m';
|
||||
filetemp_open = '%s/output_%s_%s_openblas.m';
|
||||
filetemp_vend = '%s/output_%s_%s_vendor.m';
|
||||
|
||||
% Create a variable name "template" for the variables contained in the
|
||||
% files outlined above.
|
||||
vartemp = 'data_%s_%s_%s( :, : )';
|
||||
|
||||
% Define the datatypes and operations we will be plotting.
|
||||
oproot = sprintf( '%cgemm', dt_ch );
|
||||
ops( 1, : ) = sprintf( '%s_nn', oproot );
|
||||
ops( 2, : ) = sprintf( '%s_nt', oproot );
|
||||
ops( 3, : ) = sprintf( '%s_tn', oproot );
|
||||
ops( 4, : ) = sprintf( '%s_tt', oproot );
|
||||
|
||||
% Generate datatype-specific operation names from the set of operations
|
||||
% and datatypes.
|
||||
[ opsupnames, opnames ] = gen_opsupnames( ops, stor_str, smalldims );
|
||||
n_opsupnames = size( opsupnames, 1 );
|
||||
|
||||
%opsupnames
|
||||
%opnames
|
||||
%return
|
||||
|
||||
if 1 == 1
|
||||
%fig = figure('Position', [100, 100, 2400, 1500]);
|
||||
fig = figure('Position', [100, 100, 2400, 1200]);
|
||||
orient( fig, 'portrait' );
|
||||
set(gcf,'PaperUnits', 'inches');
|
||||
if impl == 'matlab'
|
||||
set(gcf,'PaperSize', [11.5 20.4]);
|
||||
set(gcf,'PaperPosition', [0 0 11.5 20.4]);
|
||||
set(gcf,'PaperPositionMode','manual');
|
||||
else % impl == 'octave' % octave 4.x
|
||||
set(gcf,'PaperSize', [12 21.5]);
|
||||
set(gcf,'PaperPositionMode','auto');
|
||||
end
|
||||
set(gcf,'PaperOrientation','landscape');
|
||||
end
|
||||
|
||||
|
||||
% Iterate over the list of datatype-specific operation names.
|
||||
for opi = 1:n_opsupnames
|
||||
%for opi = 1:1
|
||||
|
||||
% Grab the current datatype combination.
|
||||
opsupname = opsupnames( opi, : );
|
||||
opname = opnames( opi, : );
|
||||
|
||||
opsupname = strtrim( opsupname );
|
||||
opname = strtrim( opname );
|
||||
|
||||
str = sprintf( 'Plotting %2d: %s', opi, opsupname ); disp(str);
|
||||
|
||||
% Construct filenames for the data files from templates.
|
||||
file_blissup = sprintf( filetemp_blissup, dirpath, thr_str, opsupname );
|
||||
file_blislpab = sprintf( filetemp_blislpab, dirpath, thr_str, opsupname );
|
||||
file_eigen = sprintf( filetemp_eigen, dirpath, thr_str, opsupname );
|
||||
file_open = sprintf( filetemp_open, dirpath, thr_str, opsupname );
|
||||
file_vend = sprintf( filetemp_vend, dirpath, thr_str, opsupname );
|
||||
|
||||
% Load the data files.
|
||||
%str = sprintf( ' Loading %s', file_blissup ); disp(str);
|
||||
run( file_blissup )
|
||||
run( file_blislpab )
|
||||
run( file_eigen )
|
||||
run( file_open )
|
||||
run( file_vend )
|
||||
|
||||
% Construct variable names for the variables in the data files.
|
||||
var_blissup = sprintf( vartemp, thr_str, opname, 'blissup' );
|
||||
var_blislpab = sprintf( vartemp, thr_str, opname, 'blislpab' );
|
||||
var_eigen = sprintf( vartemp, thr_str, opname, 'eigen' );
|
||||
var_open = sprintf( vartemp, thr_str, opname, 'openblas' );
|
||||
var_vend = sprintf( vartemp, thr_str, opname, 'vendor' );
|
||||
|
||||
% Use eval() to instantiate the variable names constructed above,
|
||||
% copying each to a simplified name.
|
||||
data_blissup = eval( var_blissup ); % e.g. data_st_dgemm_blissup( :, : );
|
||||
data_blislpab = eval( var_blislpab ); % e.g. data_st_dgemm_blislpab( :, : );
|
||||
data_eigen = eval( var_eigen ); % e.g. data_st_dgemm_eigen( :, : );
|
||||
data_open = eval( var_open ); % e.g. data_st_dgemm_openblas( :, : );
|
||||
data_vend = eval( var_vend ); % e.g. data_st_dgemm_vendor( :, : );
|
||||
|
||||
%str = sprintf( ' Reading %s', var_blissup ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_blislpab ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_eigen ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_open ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_bfeo ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_xsmm ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_vend ); disp(str);
|
||||
|
||||
% Plot one result in an m x n grid of plots, via the subplot()
|
||||
% function.
|
||||
if 1 == 1
|
||||
plot_l3sup_perf( opsupname, ...
|
||||
data_blissup, ...
|
||||
data_blislpab, ...
|
||||
data_eigen, ...
|
||||
data_open, ...
|
||||
data_vend, vend_str, ...
|
||||
nth, ...
|
||||
4, 7, ...
|
||||
cfreq, ...
|
||||
dflopspercycle, ...
|
||||
opi, impl );
|
||||
|
||||
clear data_mt_*gemm_*;
|
||||
clear data_blissup;
|
||||
clear data_blislpab;
|
||||
clear data_eigen;
|
||||
clear data_open;
|
||||
clear data_vend;
|
||||
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
% Construct the name of the file to which we will output the graph.
|
||||
outfile = sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth );
|
||||
|
||||
% Output the graph to pdf format.
|
||||
%print(gcf, 'gemm_md','-fillpage','-dpdf');
|
||||
%print(gcf, outfile,'-bestfit','-dpdf');
|
||||
if impl == 'octave'
|
||||
print(gcf, outfile);
|
||||
else % if impl == 'matlab'
|
||||
print(gcf, outfile,'-bestfit','-dpdf');
|
||||
end
|
||||
|
||||
12
test/supmt/octave/runthese.m
Normal file
12
test/supmt/octave/runthese.m
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
% haswell
|
||||
plot_panel_trxsh(3.25,16,1,'mt','d','ccc',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all;
|
||||
plot_panel_trxsh(3.25,16,1,'mt','d','rrr',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all;
|
||||
|
||||
% kabylake
|
||||
plot_panel_trxsh(3.80,16,1,'mt','d','rrr',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all;
|
||||
plot_panel_trxsh(3.80,16,1,'mt','d','ccc',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all;
|
||||
|
||||
% epyc
|
||||
plot_panel_trxsh(3.00, 8,1,'mt','d','rrr',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
|
||||
plot_panel_trxsh(3.00, 8,1,'mt','d','ccc',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
|
||||
188
test/supmt/runme.sh
Executable file
188
test/supmt/runme.sh
Executable file
@@ -0,0 +1,188 @@
|
||||
#!/bin/bash
|
||||
|
||||
# File pefixes.
|
||||
exec_root="test"
|
||||
out_root="output"
|
||||
|
||||
sys="blis"
|
||||
#sys="lonestar5"
|
||||
#sys="ul252"
|
||||
#sys="ul264"
|
||||
|
||||
if [ ${sys} = "blis" ]; then
|
||||
|
||||
export GOMP_CPU_AFFINITY="0-3"
|
||||
nt=4
|
||||
|
||||
elif [ ${sys} = "lonestar5" ]; then
|
||||
|
||||
export GOMP_CPU_AFFINITY="0-23"
|
||||
nt=24
|
||||
|
||||
elif [ ${sys} = "ul252" ]; then
|
||||
|
||||
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
|
||||
export GOMP_CPU_AFFINITY="0-51"
|
||||
nt=52
|
||||
|
||||
elif [ ${sys} = "ul264" ]; then
|
||||
|
||||
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
|
||||
export GOMP_CPU_AFFINITY="0-63"
|
||||
nt=64
|
||||
|
||||
fi
|
||||
|
||||
# Delay between test cases.
|
||||
delay=0.02
|
||||
|
||||
# Threadedness to test.
|
||||
threads="mt"
|
||||
|
||||
# Datatypes to test.
|
||||
#dts="d s"
|
||||
dts="d"
|
||||
|
||||
# Operations to test.
|
||||
ops="gemm"
|
||||
|
||||
# Transpose combintions to test.
|
||||
trans="nn nt tn tt"
|
||||
|
||||
# Storage combinations to test.
|
||||
#stors="rrr rrc rcr rcc crr crc ccr ccc"
|
||||
stors="rrr ccc"
|
||||
|
||||
# Problem shapes to test.
|
||||
shapes="sll lsl lls lss sls ssl lll"
|
||||
|
||||
# FGVZ: figure out how to probe what's in the directory and
|
||||
# execute everything that's there?
|
||||
sms="6"
|
||||
sns="8"
|
||||
sks="10"
|
||||
|
||||
# Implementations to test.
|
||||
impls="vendor blissup blislpab openblas eigen"
|
||||
#impls="vendor"
|
||||
#impls="blissup"
|
||||
#impls="blislpab"
|
||||
#impls="openblas"
|
||||
#impls="eigen"
|
||||
|
||||
# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
|
||||
# restore the value.
|
||||
GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}
|
||||
|
||||
# Example: test_dgemm_nn_rrc_m6npkp_blissup_st.x
|
||||
|
||||
for th in ${threads}; do
|
||||
|
||||
for dt in ${dts}; do
|
||||
|
||||
for op in ${ops}; do
|
||||
|
||||
for tr in ${trans}; do
|
||||
|
||||
for st in ${stors}; do
|
||||
|
||||
for sh in ${shapes}; do
|
||||
|
||||
for sm in ${sms}; do
|
||||
|
||||
for sn in ${sns}; do
|
||||
|
||||
for sk in ${sks}; do
|
||||
|
||||
for im in ${impls}; do
|
||||
|
||||
if [ "${im:0:4}" = "blis" ]; then
|
||||
unset OMP_NUM_THREADS
|
||||
export BLIS_NUM_THREADS=${nt}
|
||||
elif [ "${im}" = "openblas" ]; then
|
||||
unset OMP_NUM_THREADS
|
||||
export OPENBLAS_NUM_THREADS=${nt}
|
||||
elif [ "${im}" = "eigen" ]; then
|
||||
export OMP_NUM_THREADS=${nt}
|
||||
elif [ "${im}" = "vendor" ]; then
|
||||
unset OMP_NUM_THREADS
|
||||
export MKL_NUM_THREADS=${nt}
|
||||
fi
|
||||
|
||||
# Multithreaded OpenBLAS seems to have a problem
|
||||
# running properly if GOMP_CPU_AFFINITY is set.
|
||||
# So we temporarily unset it here if we are about
|
||||
# to execute OpenBLAS, but otherwise restore it.
|
||||
if [ ${im} = "openblas" ]; then
|
||||
unset GOMP_CPU_AFFINITY
|
||||
else
|
||||
export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
|
||||
fi
|
||||
|
||||
# Limit execution of non-BLIS implementations to
|
||||
# rrr/ccc storage cases.
|
||||
if [ "${im:0:4}" != "blis" ] && \
|
||||
[ "${st}" != "rrr" ] && \
|
||||
[ "${st}" != "ccc" ]; then
|
||||
continue;
|
||||
fi
|
||||
|
||||
# Further limit execution of libxsmm to
|
||||
# ccc storage cases.
|
||||
if [ "${im:0:7}" = "libxsmm" ] && \
|
||||
[ "${st}" != "ccc" ]; then
|
||||
continue;
|
||||
fi
|
||||
|
||||
# Extract the shape chars for m, n, k.
|
||||
chm=${sh:0:1}
|
||||
chn=${sh:1:1}
|
||||
chk=${sh:2:1}
|
||||
|
||||
# Construct the shape substring (e.g. m6npkp)
|
||||
shstr=""
|
||||
|
||||
if [ ${chm} = "s" ]; then
|
||||
shstr="${shstr}m${sm}"
|
||||
else
|
||||
shstr="${shstr}mp"
|
||||
fi
|
||||
|
||||
if [ ${chn} = "s" ]; then
|
||||
shstr="${shstr}n${sn}"
|
||||
else
|
||||
shstr="${shstr}np"
|
||||
fi
|
||||
|
||||
if [ ${chk} = "s" ]; then
|
||||
shstr="${shstr}k${sk}"
|
||||
else
|
||||
shstr="${shstr}kp"
|
||||
fi
|
||||
|
||||
# Ex: test_dgemm_nn_rrc_m6npkp_blissup_st.x
|
||||
|
||||
# Construct the name of the test executable.
|
||||
exec_name="${exec_root}_${dt}${op}_${tr}_${st}_${shstr}_${im}_${th}.x"
|
||||
|
||||
# Construct the name of the output file.
|
||||
out_file="${out_root}_${th}_${dt}${op}_${tr}_${st}_${shstr}_${im}.m"
|
||||
|
||||
echo "Running (nt = ${nt}) ./${exec_name} > ${out_file}"
|
||||
|
||||
# Run executable.
|
||||
./${exec_name} > ${out_file}
|
||||
|
||||
sleep ${delay}
|
||||
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
589
test/supmt/test_gemm.c
Normal file
589
test/supmt/test_gemm.c
Normal file
@@ -0,0 +1,589 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#ifdef EIGEN
|
||||
#define BLIS_DISABLE_BLAS_DEFS
|
||||
#include "blis.h"
|
||||
#include <Eigen/Core>
|
||||
//#include <Eigen/src/misc/blas.h>
|
||||
using namespace Eigen;
|
||||
#else
|
||||
#include "blis.h"
|
||||
#endif
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
rntm_t rntm_g;
|
||||
|
||||
bli_init();
|
||||
|
||||
// Copy the global rntm_t object in case we need it later when disabling
|
||||
// sup.
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
|
||||
#ifndef ERROR_CHECK
|
||||
bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
#endif
|
||||
|
||||
|
||||
dim_t n_trials = N_TRIALS;
|
||||
|
||||
num_t dt = DT;
|
||||
|
||||
#if 1
|
||||
dim_t p_begin = P_BEGIN;
|
||||
dim_t p_max = P_MAX;
|
||||
dim_t p_inc = P_INC;
|
||||
#else
|
||||
dim_t p_begin = 4;
|
||||
dim_t p_max = 40;
|
||||
dim_t p_inc = 4;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
dim_t m_input = M_DIM;
|
||||
dim_t n_input = N_DIM;
|
||||
dim_t k_input = K_DIM;
|
||||
#else
|
||||
p_begin = p_inc = 32;
|
||||
dim_t m_input = 6;
|
||||
dim_t n_input = -1;
|
||||
dim_t k_input = -1;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
trans_t transa = TRANSA;
|
||||
trans_t transb = TRANSB;
|
||||
#else
|
||||
trans_t transa = BLIS_NO_TRANSPOSE;
|
||||
trans_t transb = BLIS_NO_TRANSPOSE;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
stor3_t sc = STOR3;
|
||||
#else
|
||||
stor3_t sc = BLIS_RRR;
|
||||
#endif
|
||||
|
||||
|
||||
inc_t rs_c, cs_c;
|
||||
inc_t rs_a, cs_a;
|
||||
inc_t rs_b, cs_b;
|
||||
|
||||
if ( sc == BLIS_RRR ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b = -1; }
|
||||
else if ( sc == BLIS_RRC ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b = 0; }
|
||||
else if ( sc == BLIS_RCR ) { rs_c = cs_c = -1; rs_a = cs_a = 0; rs_b = cs_b = -1; }
|
||||
else if ( sc == BLIS_RCC ) { rs_c = cs_c = -1; rs_a = cs_a = 0; rs_b = cs_b = 0; }
|
||||
else if ( sc == BLIS_CRR ) { rs_c = cs_c = 0; rs_a = cs_a = -1; rs_b = cs_b = -1; }
|
||||
else if ( sc == BLIS_CRC ) { rs_c = cs_c = 0; rs_a = cs_a = -1; rs_b = cs_b = 0; }
|
||||
else if ( sc == BLIS_CCR ) { rs_c = cs_c = 0; rs_a = cs_a = 0; rs_b = cs_b = -1; }
|
||||
else if ( sc == BLIS_CCC ) { rs_c = cs_c = 0; rs_a = cs_a = 0; rs_b = cs_b = 0; }
|
||||
else { bli_abort(); }
|
||||
|
||||
f77_int cbla_storage;
|
||||
|
||||
if ( sc == BLIS_RRR ) cbla_storage = CblasRowMajor;
|
||||
else if ( sc == BLIS_CCC ) cbla_storage = CblasColMajor;
|
||||
else cbla_storage = -1;
|
||||
|
||||
( void )cbla_storage;
|
||||
|
||||
|
||||
char dt_ch;
|
||||
|
||||
// Choose the char corresponding to the requested datatype.
|
||||
if ( bli_is_float( dt ) ) dt_ch = 's';
|
||||
else if ( bli_is_double( dt ) ) dt_ch = 'd';
|
||||
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
|
||||
else dt_ch = 'z';
|
||||
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
char transal, transbl;
|
||||
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
|
||||
transal = tolower( f77_transa );
|
||||
transbl = tolower( f77_transb );
|
||||
|
||||
f77_int cbla_transa = ( transal == 'n' ? CblasNoTrans : CblasTrans );
|
||||
f77_int cbla_transb = ( transbl == 'n' ? CblasNoTrans : CblasTrans );
|
||||
|
||||
( void )cbla_transa;
|
||||
( void )cbla_transb;
|
||||
|
||||
dim_t p;
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
|
||||
|
||||
printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch,
|
||||
transal, transbl, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
|
||||
//for ( p = p_begin; p <= p_max; p += p_inc )
|
||||
for ( p = p_max; p_begin <= p; p -= p_inc )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t m, n, k;
|
||||
|
||||
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
|
||||
else k = ( dim_t ) k_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
bli_obj_create( dt, m, n, rs_c, cs_c, &c );
|
||||
bli_obj_create( dt, m, n, rs_c, cs_c, &c_save );
|
||||
|
||||
if ( bli_does_notrans( transa ) )
|
||||
bli_obj_create( dt, m, k, rs_a, cs_a, &a );
|
||||
else
|
||||
bli_obj_create( dt, k, m, rs_a, cs_a, &a );
|
||||
|
||||
if ( bli_does_notrans( transb ) )
|
||||
bli_obj_create( dt, k, n, rs_b, cs_b, &b );
|
||||
else
|
||||
bli_obj_create( dt, n, k, rs_b, cs_b, &b );
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_conjtrans( transb, &b );
|
||||
|
||||
bli_setsc( (1.0/1.0), 0.0, &alpha );
|
||||
bli_setsc( (1.0/1.0), 0.0, &beta );
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#ifdef EIGEN
|
||||
double alpha_r, alpha_i;
|
||||
|
||||
bli_getsc( &alpha, &alpha_r, &alpha_i );
|
||||
|
||||
void* ap = bli_obj_buffer_at_off( &a );
|
||||
void* bp = bli_obj_buffer_at_off( &b );
|
||||
void* cp = bli_obj_buffer_at_off( &c );
|
||||
|
||||
const int os_a = ( bli_obj_is_col_stored( &a ) ? bli_obj_col_stride( &a )
|
||||
: bli_obj_row_stride( &a ) );
|
||||
const int os_b = ( bli_obj_is_col_stored( &b ) ? bli_obj_col_stride( &b )
|
||||
: bli_obj_row_stride( &b ) );
|
||||
const int os_c = ( bli_obj_is_col_stored( &c ) ? bli_obj_col_stride( &c )
|
||||
: bli_obj_row_stride( &c ) );
|
||||
|
||||
Stride<Dynamic,1> stride_a( os_a, 1 );
|
||||
Stride<Dynamic,1> stride_b( os_b, 1 );
|
||||
Stride<Dynamic,1> stride_c( os_c, 1 );
|
||||
|
||||
#if defined(IS_FLOAT)
|
||||
#elif defined (IS_DOUBLE)
|
||||
#ifdef A_STOR_R
|
||||
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_A;
|
||||
#else
|
||||
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_A;
|
||||
#endif
|
||||
#ifdef B_STOR_R
|
||||
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_B;
|
||||
#else
|
||||
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_B;
|
||||
#endif
|
||||
#ifdef C_STOR_R
|
||||
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_C;
|
||||
#else
|
||||
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_C;
|
||||
#endif
|
||||
|
||||
#ifdef A_NOTRANS // A is not transposed
|
||||
Map<MatrixXd_A, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
|
||||
#else // A is transposed
|
||||
Map<MatrixXd_A, 0, Stride<Dynamic,1> > A( ( double* )ap, k, m, stride_a );
|
||||
#endif
|
||||
|
||||
#ifdef B_NOTRANS // B is not transposed
|
||||
Map<MatrixXd_B, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
|
||||
#else // B is transposed
|
||||
Map<MatrixXd_B, 0, Stride<Dynamic,1> > B( ( double* )bp, n, k, stride_b );
|
||||
#endif
|
||||
|
||||
Map<MatrixXd_C, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
double dtime_save = DBL_MAX;
|
||||
|
||||
for ( dim_t r = 0; r < n_trials; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
|
||||
double dtime = bli_clock();
|
||||
|
||||
|
||||
#ifdef EIGEN
|
||||
|
||||
#ifdef A_NOTRANS
|
||||
#ifdef B_NOTRANS
|
||||
C.noalias() += alpha_r * A * B;
|
||||
#else // B_TRANS
|
||||
C.noalias() += alpha_r * A * B.transpose();
|
||||
#endif
|
||||
#else // A_TRANS
|
||||
#ifdef B_NOTRANS
|
||||
C.noalias() += alpha_r * A.transpose() * B;
|
||||
#else // B_TRANS
|
||||
C.noalias() += alpha_r * A.transpose() * B.transpose();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#ifdef BLIS
|
||||
#ifdef SUP
|
||||
// Allow sup.
|
||||
bli_gemm( &alpha,
|
||||
&a,
|
||||
&b,
|
||||
&beta,
|
||||
&c );
|
||||
#else
|
||||
// Disable sup and use the expert interface.
|
||||
//rntm_t rntm = BLIS_RNTM_INITIALIZER;
|
||||
rntm_t rntm = rntm_g;
|
||||
bli_rntm_disable_l3_sup( &rntm );
|
||||
|
||||
bli_gemm_ex( &alpha,
|
||||
&a,
|
||||
&b,
|
||||
&beta,
|
||||
&c, NULL, &rntm );
|
||||
#endif
|
||||
#endif
|
||||
#ifdef BLAS
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = ( float* )bli_obj_buffer( &alpha );
|
||||
float* ap = ( float* )bli_obj_buffer( &a );
|
||||
float* bp = ( float* )bli_obj_buffer( &b );
|
||||
float* betap = ( float* )bli_obj_buffer( &beta );
|
||||
float* cp = ( float* )bli_obj_buffer( &c );
|
||||
|
||||
#ifdef XSMM
|
||||
libxsmm_sgemm( &f77_transa,
|
||||
#else
|
||||
sgemm_( &f77_transa,
|
||||
#endif
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = ( double* )bli_obj_buffer( &alpha );
|
||||
double* ap = ( double* )bli_obj_buffer( &a );
|
||||
double* bp = ( double* )bli_obj_buffer( &b );
|
||||
double* betap = ( double* )bli_obj_buffer( &beta );
|
||||
double* cp = ( double* )bli_obj_buffer( &c );
|
||||
|
||||
#ifdef XSMM
|
||||
libxsmm_dgemm( &f77_transa,
|
||||
#else
|
||||
dgemm_( &f77_transa,
|
||||
#endif
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
|
||||
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
|
||||
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
|
||||
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
|
||||
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
|
||||
|
||||
#ifdef XSMM
|
||||
libxsmm_cgemm( &f77_transa,
|
||||
#else
|
||||
cgemm_( &f77_transa,
|
||||
#endif
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
|
||||
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
|
||||
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
|
||||
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
|
||||
|
||||
#ifdef XSMM
|
||||
libxsmm_zgemm( &f77_transa,
|
||||
#else
|
||||
zgemm_( &f77_transa,
|
||||
#endif
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
#ifdef CBLAS
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
#ifdef C_STOR_R
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
#else
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
#endif
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* bp = bli_obj_buffer( &b );
|
||||
float* betap = bli_obj_buffer( &beta );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_sgemm( cbla_storage,
|
||||
cbla_transa,
|
||||
cbla_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
#ifdef C_STOR_R
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
#else
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
#endif
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* bp = bli_obj_buffer( &b );
|
||||
double* betap = bli_obj_buffer( &beta );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_dgemm( cbla_storage,
|
||||
cbla_transa,
|
||||
cbla_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
#ifdef C_STOR_R
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
#else
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
#endif
|
||||
scomplex* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* bp = bli_obj_buffer( &b );
|
||||
scomplex* betap = bli_obj_buffer( &beta );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_cgemm( cbla_storage,
|
||||
cbla_transa,
|
||||
cbla_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
betap,
|
||||
cp, ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
#ifdef C_STOR_R
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
#else
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
#endif
|
||||
dcomplex* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* bp = bli_obj_buffer( &b );
|
||||
dcomplex* betap = bli_obj_buffer( &beta );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_zgemm( cbla_storage,
|
||||
cbla_transa,
|
||||
cbla_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
betap,
|
||||
cp, ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
double gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch,
|
||||
transal, transbl, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )n,
|
||||
( unsigned long )k, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user