Support multithreading within the sup framework.

Details:
- Added multithreading support to the sup framework (via either OpenMP
  or pthreads). Both variants 1n and 2m now have the appropriate
  threading infrastructure, including data partitioning logic, to
  parallelize computation. This support handles all four combinations
  of packing on matrices A and B (neither, A only, B only, or both).
  This implementation tries to be a little smarter when automatic
  threading is requested (e.g. via BLIS_NUM_THREADS) in that it will
  recalculate the factorization in units of micropanels (rather than
  using the raw dimensions) in bli_l3_sup_int.c, when the final
  problem shape is known and after threads have already been spawned.
- Implemented bli_?packm_sup_var2(), which packs to conventional row-
  or column-stored matrices. (This is used for the rrc and crc storage
  cases.) Previously, copym was used, but that would no longer suffice
  because it could not be parallelized.
- Minor reorganization of packing-related sup functions. Specifically,
  bli_packm_sup_init_mem_[ab]() are called from within packm_sup_[ab]()
  instead of from the variant functions. This has the effect of making
  the variant functions more readable.
- Added additional bli_thrinfo_set_*() static functions to bli_thrinfo.h
  and inserted usage of these functions within bli_thrinfo_init(), which
  previously was accessing thrinfo_t fields via the -> operator.
- Renamed bli_partition_2x2() to bli_thread_partition_2x2().
- Added an auto_factor field to the rntm_t struct in order to track
  whether automatic thread factorization was originally requested.
- Added new test drivers in test/supmt that perform multithreaded sup
  tests, as well as appropriate octave/matlab scripts to plot the
  resulting output files.
- Added additional language to docs/Multithreading.md to make it clear
  that specifying any BLIS_*_NT variable, even if it is set to 1, will
  be considered manual specification for the purposes of determining
  whether to auto-factorize via BLIS_NUM_THREADS.
- Minor comment updates.
This commit is contained in:
Field G. Van Zee
2020-02-17 14:08:08 -06:00
parent d7a7679182
commit c0558fde45
40 changed files with 3831 additions and 674 deletions

View File

@@ -161,8 +161,8 @@ void bli_cntx_init_haswell( cntx_t* cntx )
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 201, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 201, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 201, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh

View File

@@ -171,8 +171,8 @@ void bli_cntx_init_zen( cntx_t* cntx )
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 120, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 220, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh

View File

@@ -109,7 +109,7 @@ Regardless of which method is employed, and which specific way within each metho
**Note**: Please be aware of what happens if you try to specify both the automatic and manual ways, as it could otherwise confuse new users. Here are the important points:
* Regardless of which broad method is used, **if multithreading is specified via both the automatic and manual ways, the values set via the manual way will always take precedence.**
* Specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1).
* Specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1). And in the case of the environment variable method, setting the ways of parallelism for a loop to 1 counts as specifying parallelism! If you want to switch from using the manual way to automatic way, you must not only set (`export`) the `BLIS_NUM_THREADS` variable, but you must also `unset` all of the `BLIS_*_NT` variables.
* If you have specified multithreading via *both* the automatic and manual ways, BLIS will **not** complain if the values are inconsistent with one another. (For example, you may request 8 total threads be used while also specifying 4 ways of parallelism within each of two matrix multiplication loops, for a total of 16 ways.) Furthermore, you will be able to query these inconsistent values via the runtime API both before and after multithreading executes.
* If multithreading is disabled, you **may still** specify multithreading values via either the manual or automatic ways. However, BLIS will silently ignore **all** of these values. A BLIS library that is built with multithreading disabled at configure-time will always run sequentially (from the perspective of a single application thread).

View File

@@ -43,7 +43,6 @@ err_t bli_gemmsup_int
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
@@ -89,45 +88,86 @@ err_t bli_gemmsup_int
stor_id == BLIS_CRR );
const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
const num_t dt = bli_obj_dt( c );
const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
const num_t dt = bli_obj_dt( c );
const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
const bool_t is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
: is_rcc_crc_ccr_ccc );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
const bool_t auto_factor = bli_rntm_auto_factor( rntm );
const dim_t n_threads = bli_rntm_num_threads( rntm );
bool_t use_bp = TRUE;
dim_t jc_new;
dim_t ic_new;
if ( is_primary )
{
// This branch handles:
// - rrr rrc rcr crr for row-preferential kernels
// - rcc crc ccr ccc for column-preferential kernels
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t mu = m / MR;
const dim_t nu = n / NR;
if ( mu >= nu )
//if ( m % 2 == 1 && n % 2 == 1 )
// Decide which algorithm to use (block-panel var2m or panel-block
// var1n) based on the number of micropanels in the m and n dimensions.
// Also, recalculate the automatic thread factorization.
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = FALSE;
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.
if ( auto_factor )
{
if ( use_bp )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
}
else // if ( !use_bp )
{
// In the panel-block algorithm, the m dimension is parallelized
// with jc_nt and the n dimension is parallelized with ic_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
}
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
if ( use_bp )
{
#ifdef TRACEVAR
if ( bli_thread_am_ochief( thread ) )
printf( "bli_l3_sup_int(): var2m primary\n" );
#endif
// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, cntl, thread );
stor_id, cntx, rntm, thread );
}
else // if ( mu < nu )
else // use_pb
{
#ifdef TRACEVAR
if ( bli_thread_am_ochief( thread ) )
printf( "bli_l3_sup_int(): var1n primary\n" );
#endif
// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, cntl, thread );
stor_id, cntx, rntm, thread );
// *requires nudging of nc up to be a multiple of mr.
}
}
else
@@ -136,35 +176,64 @@ err_t bli_gemmsup_int
// - rrr rrc rcr crr for column-preferential kernels
// - rcc crc ccr ccc for row-preferential kernels
const dim_t mt = bli_obj_width( c );
const dim_t nt = bli_obj_length( c );
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t mu = mt / MR;
const dim_t nu = nt / NR;
const dim_t mu = n / MR; // the n becomes m after a transposition
const dim_t nu = m / NR; // the m becomes n after a transposition
if ( mu >= nu )
//if ( mt % 2 == 1 && nt % 2 == 1 )
// Decide which algorithm to use (block-panel var2m or panel-block
// var1n) based on the number of micropanels in the m and n dimensions.
// Also, recalculate the automatic thread factorization.
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = FALSE;
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.
if ( auto_factor )
{
if ( use_bp )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
}
else // if ( !use_bp )
{
// In the panel-block algorithm, the m dimension is parallelized
// with jc_nt and the n dimension is parallelized with ic_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
}
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
if ( use_bp )
{
#ifdef TRACEVAR
if ( bli_thread_am_ochief( thread ) )
printf( "bli_l3_sup_int(): var2m non-primary\n" );
#endif
// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, cntl, thread );
stor_id, cntx, rntm, thread );
}
else // if ( mu < nu )
else // use_pb
{
#ifdef TRACEVAR
if ( bli_thread_am_ochief( thread ) )
printf( "bli_l3_sup_int(): var1n non-primary\n" );
#endif
// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, cntl, thread );
stor_id, cntx, rntm, thread );
// *requires nudging of mc up to be a multiple of nr.
}
// *requires nudging of mc,nc up to be a multiple of nr,mr.
}
// Return success so that the caller knows that we computed the solution.

View File

@@ -41,6 +41,5 @@ err_t bli_gemmsup_int
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);

View File

@@ -42,7 +42,6 @@ void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
dim_t m, \
dim_t k, \
dim_t mr, \
@@ -58,8 +57,6 @@ void PASTEMAC(ch,opname) \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
packbuf_t pack_buf_type_use; \
\
/* NOTE: This "rounding up" of the last upanel is actually optional
for the rrc/crc cases, but absolutely necessary for the other cases
since we NEED that last micropanel to have the same ldim (cs_p) as
@@ -68,21 +65,9 @@ void PASTEMAC(ch,opname) \
const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
const dim_t k_pack = k; \
\
/* Determine the dimensions and strides for the packed matrix A. */ \
if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) \
{ \
/* stor3_t id values _RRC and _CRC: pack A to plain row storage,
which can use packing buffer type for general usage. */ \
pack_buf_type_use = BLIS_BUFFER_FOR_GEN_USE; \
} \
else \
{ \
/* All other stor3_t ids: pack A to column-stored row-panels
using the packing buffer type as specified by the caller. */ \
/*pack_buf_type_use = BLIS_BUFFER_FOR_A_BLOCK;*/ \
pack_buf_type_use = pack_buf_type; \
} \
/* Barrier to make sure all threads are caught up and ready to begin
the packm stage. */ \
bli_thread_obarrier( thread ); \
\
/* Compute the size of the memory block needed. */ \
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
@@ -91,21 +76,40 @@ void PASTEMAC(ch,opname) \
then we need to acquire a block from the memory broker. */ \
if ( bli_mem_is_unalloc( mem ) ) \
{ \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type_use, \
mem \
); \
} \
else \
{ \
/* NOTE: This shouldn't execute since the sup code path calls this
function only once, before *any* loops of the gemm algorithm are
encountered. */ \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* Acquire directly to the chief thread's mem_t that was
passed in. It needs to be that mem_t struct, and not a
local (temporary) mem_t, since there is no barrier until
after packing is finished, which could allow a race
condition whereby the chief thread exits the current
function before the other threads have a chance to copy
from it. (A barrier would fix that race condition, but
then again, I prefer to keep barriers to a minimum.) */ \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else /* if ( bli_mem_is_alloc( mem ) ) */ \
{ \
/* If the mem_t entry provided by the caller does NOT contain a NULL
buffer, then a block has already been acquired from the memory
broker and cached by the caller. */ \
@@ -118,18 +122,40 @@ void PASTEMAC(ch,opname) \
\
if ( mem_size < size_needed ) \
{ \
bli_membrk_release \
( \
rntm, \
mem \
); \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type_use, \
mem \
); \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* The chief thread releases the existing block associated
with the mem_t, and then re-acquires a new block, saving
the associated mem_t to its passed-in mem_t. (See comment
above for why the acquisition needs to be directly to
the chief thread's passed-in mem_t and not a local
(temporary) mem_t.) */ \
bli_membrk_release \
( \
rntm, \
mem \
); \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else \
{ \
@@ -161,15 +187,19 @@ void PASTEMAC(ch,opname) \
} \
else /* if ( did_pack == TRUE ) */ \
{ \
/* Check the mem_t entry provided by the caller. Only proceed if it
is allocated, which it should be. */ \
if ( bli_mem_is_alloc( mem ) ) \
if ( thread != NULL ) \
if ( bli_thread_am_ochief( thread ) ) \
{ \
bli_membrk_release \
( \
rntm, \
mem \
); \
/* Check the mem_t entry provided by the caller. Only proceed if it
is allocated, which it should be. */ \
if ( bli_mem_is_alloc( mem ) ) \
{ \
bli_membrk_release \
( \
rntm, \
mem \
); \
} \
} \
} \
}
@@ -282,8 +312,11 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_a )
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
trans_t transc, \
dim_t m_alloc, \
dim_t k_alloc, \
dim_t m, \
dim_t k, \
dim_t mr, \
@@ -292,6 +325,7 @@ void PASTEMAC(ch,opname) \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
@@ -300,6 +334,19 @@ void PASTEMAC(ch,opname) \
dim_t m_max; \
dim_t k_max; \
dim_t pd_p; \
\
/* Prepare the packing destination buffer. If packing is not requested,
this function will reduce to a no-op. */ \
PASTEMAC(ch,packm_sup_init_mem_a) \
( \
will_pack, \
pack_buf_type, \
m_alloc, k_alloc, mr, \
cntx, \
rntm, \
mem, \
thread \
); \
\
/* Determine the packing buffer and related parameters for matrix A. If A
will not be packed, then a_use will be set to point to a and the _a_use
@@ -323,43 +370,39 @@ void PASTEMAC(ch,opname) \
if ( will_pack == FALSE ) \
{ \
/* If we aren't going to pack matrix A, then there's nothing to do. */ \
/*
printf( "blis_ packm_sup_a: not packing A.\n" ); \
*/ \
\
/*
printf( "blis_ packm_sup_a: not packing A.\n" ); \
*/ \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
if ( schema == BLIS_PACKED_ROWS ) \
{ \
/* For plain packing by rows, use copym.
NOTE: We assume kappa = 1; otherwise, we need scal2m. */ \
/*
printf( "blis_ packm_sup_a: packing A to rows.\n" ); \
*/ \
\
/* NOTE: This call to copym must be replaced by a proper packm
variant, implemented as a loop over copym, once multithreading
support is added. */ \
\
/*
printf( "blis_ packm_sup_a: packing A to rows.\n" ); \
*/ \
PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
/* For plain packing by rows, use var2. */ \
PASTEMAC(ch,packm_sup_var2) \
( \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
transc, \
schema, \
m, \
k, \
kappa, \
a, rs_a, cs_a, \
*p, *rs_p, *cs_p, \
cntx, \
NULL \
thread \
); \
} \
else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ \
{ \
/*
printf( "blis_ packm_sup_a: packing A to row panels.\n" ); \
*/ \
/*
printf( "blis_ packm_sup_a: packing A to row panels.\n" ); \
*/ \
\
/* For packing to column-stored row panels, use var1. */ \
PASTEMAC(ch,packm_sup_var1) \
( \
@@ -377,6 +420,9 @@ printf( "blis_ packm_sup_a: packing A to row panels.\n" ); \
thread \
); \
} \
\
/* Barrier so that packing is done before computation. */ \
bli_thread_obarrier( thread ); \
} \
}

View File

@@ -40,7 +40,6 @@ void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
dim_t m, \
dim_t k, \
dim_t mr, \
@@ -80,7 +79,7 @@ void PASTEMAC(ch,opname) \
dim_t mr, \
dim_t* restrict m_max, \
dim_t* restrict k_max, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
@@ -97,8 +96,11 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_a )
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
trans_t transc, \
dim_t m_alloc, \
dim_t k_alloc, \
dim_t m, \
dim_t k, \
dim_t mr, \
@@ -107,6 +109,7 @@ void PASTEMAC(ch,opname) \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \

View File

@@ -42,7 +42,6 @@ void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
dim_t k, \
dim_t n, \
dim_t nr, \
@@ -58,8 +57,6 @@ void PASTEMAC(ch,opname) \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
packbuf_t pack_buf_type_use; \
\
/* NOTE: This "rounding up" of the last upanel is actually optional
for the rrc/crc cases, but absolutely necessary for the other cases
since we NEED that last micropanel to have the same ldim (cs_p) as
@@ -68,21 +65,9 @@ void PASTEMAC(ch,opname) \
const dim_t k_pack = k; \
const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
\
/* Determine the dimensions and strides for the packed matrix B. */ \
if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) \
{ \
/* stor3_t id values _RRC and _CRC: pack B to plain column storage,
which can use packing buffer type for general usage. */ \
pack_buf_type_use = BLIS_BUFFER_FOR_GEN_USE; \
} \
else \
{ \
/* All other stor3_t ids: pack A to row-stored column-panels
using the packing buffer type as specified by the caller. */ \
/*pack_buf_type_use = BLIS_BUFFER_FOR_B_PANEL;*/ \
pack_buf_type_use = pack_buf_type; \
} \
/* Barrier to make sure all threads are caught up and ready to begin
the packm stage. */ \
bli_thread_obarrier( thread ); \
\
/* Compute the size of the memory block needed. */ \
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
@@ -91,21 +76,40 @@ void PASTEMAC(ch,opname) \
then we need to acquire a block from the memory broker. */ \
if ( bli_mem_is_unalloc( mem ) ) \
{ \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type_use, \
mem \
); \
} \
else \
{ \
/* NOTE: This shouldn't execute since the sup code path calls this
function only once, before *any* loops of the gemm algorithm are
encountered. */ \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* Acquire directly to the chief thread's mem_t that was
passed in. It needs to be that mem_t struct, and not a
local (temporary) mem_t, since there is no barrier until
after packing is finished, which could allow a race
condition whereby the chief thread exits the current
function before the other threads have a chance to copy
from it. (A barrier would fix that race condition, but
then again, I prefer to keep barriers to a minimum.) */ \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else /* if ( bli_mem_is_alloc( mem ) ) */ \
{ \
/* If the mem_t entry provided by the caller does NOT contain a NULL
buffer, then a block has already been acquired from the memory
broker and cached by the caller. */ \
@@ -118,18 +122,40 @@ void PASTEMAC(ch,opname) \
\
if ( mem_size < size_needed ) \
{ \
bli_membrk_release \
( \
rntm, \
mem \
); \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type_use, \
mem \
); \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* The chief thread releases the existing block associated
with the mem_t, and then re-acquires a new block, saving
the associated mem_t to its passed-in mem_t. (See comment
above for why the acquisition needs to be directly to
the chief thread's passed-in mem_t and not a local
(temporary) mem_t.) */ \
bli_membrk_release \
( \
rntm, \
mem \
); \
bli_membrk_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else \
{ \
@@ -161,15 +187,19 @@ void PASTEMAC(ch,opname) \
} \
else /* if ( did_pack == TRUE ) */ \
{ \
/* Check the mem_t entry provided by the caller. Only proceed if it
is allocated, which it should be. */ \
if ( bli_mem_is_alloc( mem ) ) \
if ( thread != NULL ) \
if ( bli_thread_am_ochief( thread ) ) \
{ \
bli_membrk_release \
( \
rntm, \
mem \
); \
/* Check the mem_t entry provided by the caller. Only proceed if it
is allocated, which it should be. */ \
if ( bli_mem_is_alloc( mem ) ) \
{ \
bli_membrk_release \
( \
rntm, \
mem \
); \
} \
} \
} \
}
@@ -282,8 +312,11 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_b )
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
trans_t transc, \
dim_t k_alloc, \
dim_t n_alloc, \
dim_t k, \
dim_t n, \
dim_t nr, \
@@ -292,6 +325,7 @@ void PASTEMAC(ch,opname) \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
@@ -300,6 +334,19 @@ void PASTEMAC(ch,opname) \
dim_t k_max; \
dim_t n_max; \
dim_t pd_p; \
\
/* Prepare the packing destination buffer. If packing is not requested,
this function will reduce to a no-op. */ \
PASTEMAC(ch,packm_sup_init_mem_b) \
( \
will_pack, \
pack_buf_type, \
k_alloc, n_alloc, nr, \
cntx, \
rntm, \
mem, \
thread \
); \
\
/* Determine the packing buffer and related parameters for matrix B. If B
will not be packed, then b_use will be set to point to b and the _b_use
@@ -323,43 +370,39 @@ void PASTEMAC(ch,opname) \
if ( will_pack == FALSE ) \
{ \
/* If we aren't going to pack matrix B, then there's nothing to do. */ \
/*
printf( "blis_ packm_sup_b: not packing B.\n" ); \
*/ \
\
/*
printf( "blis_ packm_sup_b: not packing B.\n" ); \
*/ \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
if ( schema == BLIS_PACKED_COLUMNS ) \
{ \
/* For plain packing by columns, use copym.
NOTE: We assume kappa = 1; otherwise, we need scal2m. */ \
/*
printf( "blis_ packm_sup_b: packing B to columns.\n" ); \
*/ \
\
/* NOTE: This call to copym must be replaced by a proper packm
variant, implemented as a loop over copym, once multithreading
support is added. */ \
\
/*
printf( "blis_ packm_sup_b: packing B to columns.\n" ); \
*/ \
PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
/* For plain packing by columns, use var2. */ \
PASTEMAC(ch,packm_sup_var2) \
( \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
transc, \
schema, \
k, \
n, \
kappa, \
b, rs_b, cs_b, \
*p, *rs_p, *cs_p, \
cntx, \
NULL \
thread \
); \
} \
else /* if ( schema == BLIS_PACKED_COL_PANELS ) */ \
{ \
/*
printf( "blis_ packm_sup_b: packing B to col panels.\n" ); \
*/ \
/*
printf( "blis_ packm_sup_b: packing B to col panels.\n" ); \
*/ \
\
/* For packing to row-stored column panels, use var1. */ \
PASTEMAC(ch,packm_sup_var1) \
( \
@@ -377,6 +420,9 @@ printf( "blis_ packm_sup_b: packing B to col panels.\n" ); \
thread \
); \
} \
\
/* Barrier so that packing is done before computation. */ \
bli_thread_obarrier( thread ); \
} \
}

View File

@@ -40,7 +40,6 @@ void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
dim_t k, \
dim_t n, \
dim_t nr, \
@@ -80,7 +79,7 @@ void PASTEMAC(ch,opname) \
dim_t nr, \
dim_t* restrict k_max, \
dim_t* restrict n_max, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
@@ -97,16 +96,20 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_b )
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
trans_t transc, \
dim_t k_alloc, \
dim_t n_alloc, \
dim_t k, \
dim_t n, \
dim_t nr, \
ctype* restrict kappa, \
ctype* restrict x, inc_t rs_x, inc_t cs_x, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \

View File

@@ -327,3 +327,137 @@ bli_thread_obarrier( thread ); \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
   packm_sup_var2: type-templated packing variant that writes the source
   matrix c into a plain row- or column-stored destination p, one row or
   column (one "vector") per loop iteration. Each vector is written via
   scal2v with the scalar kappa, and iterations are divided among threads
   using the supplied thrinfo_t node.

   Parameters:
   - transc:  transposition/conjugation to apply to c. The conjugation bit
              is forwarded to scal2v; a transposition is induced by swapping
              the row and column strides of c.
   - schema:  pack schema; bli_is_col_packed( schema ) selects the
              column-stored path, anything else takes the row-stored path.
   - m, n:    matrix dimensions. For column storage the loop runs over n
              vectors of length m; for row storage, over m vectors of
              length n. NOTE(review): m and n are not swapped when a
              transposition is induced, so they presumably describe the
              dimensions after transposition (i.e. of p) -- confirm against
              callers.
   - kappa:   scalar passed to scal2v.
   - c, rs_c, cs_c:  source matrix and its row/column strides.
   - p, rs_p, cs_p:  destination buffer and its row/column strides. Only
              the leading dimension (cs_p for column storage, rs_p for row
              storage) is read; the element stride within a vector is
              hard-coded to 1.
   - cntx:    context forwarded to scal2v.
   - thread:  thrinfo_t node used to determine this thread's share of the
              iterations.
*/
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
cntx_t* restrict cntx, \
thrinfo_t* restrict thread \
) \
{ \
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it; \
dim_t vector_len; \
inc_t incc, ldc; \
inc_t incp, ldp; \
conj_t conjc; \
\
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
bool_t col_stored = bli_is_col_packed( schema ); \
/*bool_t row_stored = bli_is_row_packed( schema );*/ \
\
if ( col_stored ) \
{ \
/* Prepare to pack to a column-stored matrix: iterate over the n
columns, each of length m. The destination is written with unit
element stride, so only cs_p is used from the caller's strides
for p. */ \
iter_dim = n; \
vector_len = m; \
incc = rs_c; \
ldc = cs_c; \
incp = 1; \
ldp = cs_p; \
} \
else /* if ( row_stored ) */ \
{ \
/* Prepare to pack to a row-stored matrix: iterate over the m rows,
each of length n. The destination is written with unit element
stride, so only rs_p is used from the caller's strides for p. */ \
iter_dim = m; \
vector_len = n; \
incc = cs_c; \
ldc = rs_c; \
incp = 1; \
ldp = rs_p; \
} \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim; \
\
\
ctype* restrict p_begin = p_cast; \
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
( void )nt; \
( void )tid; \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
will depend on whether slab or round-robin partitioning was requested
at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every row or column (per the storage selected above) of
the source matrix. Every thread walks the full iteration space and uses
bli_packm_my_iter() to decide which iterations it actually executes. */ \
for ( it = 0; it < n_iter; it += 1 ) \
{ \
ctype* restrict c_begin = c_cast + (it )*ldc; \
\
ctype* restrict c_use = c_begin; \
ctype* restrict p_use = p_begin; \
\
{ \
/* The definition of bli_packm_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
/* Write the current source vector into the destination via scal2v
(presumably p_use := kappa * conjc( c_use ) -- confirm against
the scal2v documentation). */ \
PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \
( \
conjc, \
vector_len, \
kappa_cast, \
c_use, incc, \
p_use, incp, \
cntx, \
NULL \
); \
} \
\
} \
\
/* Advance the destination pointer on every iteration, whether or not
this thread performed the copy, so that p_begin stays in step with
the iteration index. */ \
p_begin += ldp; \
\
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
if ( !row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
} \
}
INSERT_GENTFUNCR_BASIC( packm, packm_sup_var2 )

View File

@@ -58,3 +58,21 @@ void PASTEMAC(ch,varname) \
INSERT_GENTPROT_BASIC0( packm_sup_var1 )
/* Prototype template for the packm_sup_var2 packing variant; it is
   instantiated by the INSERT_GENTPROT_BASIC0() invocation below. The
   function takes a source matrix c (with row/column strides), a destination
   buffer p (with row/column strides), the m x n dimensions, a scalar kappa,
   a transposition/conjugation argument, a pack schema, a context, and the
   calling thread's thrinfo_t node. */
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
cntx_t* restrict cntx, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( packm_sup_var2 )

View File

@@ -55,49 +55,47 @@ err_t bli_gemmsup_ref
bli_gemm_check( alpha, a, b, beta, c, cntx );
#if 0
// FGVZ: Will this be needed for constructing thrinfo_t's (recall: the
// sba needs to be attached to the rntm; see below)? Or will those nodes
// just be created "locally," in an exposed manner?
// NOTE: This special case handling is done within the variants.
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
bli_scalm( beta, c );
return;
}
// If A or B has a zero dimension, scale C by beta and return early.
if ( bli_obj_has_zero_dim( a ) ||
bli_obj_has_zero_dim( b ) )
{
bli_scalm( beta, c );
return BLIS_SUCCESS;
}
#endif
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
bli_rntm_set_ways_for_op
// set the ways of parallelism for each loop.
bli_rntm_set_ways_from_rntm_sup
(
BLIS_GEMM,
BLIS_LEFT, // ignored for gemm/hemm/symm
bli_obj_length( &c_local ),
bli_obj_width( &c_local ),
bli_obj_width( &a_local ),
bli_obj_length( c ),
bli_obj_width( c ),
bli_obj_width( a ),
rntm
);
// FGVZ: the sba needs to be attached to the rntm. But it needs
// to be done in the thread region, since it needs a thread id.
//bli_sba_rntm_set_pool( tid, array, rntm_p );
#endif
#if 0
printf( "rntm.pack_a = %d\n", ( int )bli_rntm_pack_a( rntm ) );
printf( "rntm.pack_b = %d\n", ( int )bli_rntm_pack_b( rntm ) );
#endif
//bli_rntm_set_pack_a( 0, rntm );
//bli_rntm_set_pack_b( 0, rntm );
// May not need these here since packm_sup infers the schemas based
// on the stor3_t id. (This would also mean that they don't need to
// be passed into the thread decorator below.)
//pack_t schema_a = BLIS_PACKED_ROW_PANELS;
//pack_t schema_b = BLIS_PACKED_COL_PANELS;
#endif
return
bli_l3_sup_thread_decorator
(
bli_gemmsup_int,
BLIS_GEMM, // operation family id
//schema_a,
//schema_b,
alpha,
a,
b,

File diff suppressed because it is too large Load Diff

View File

@@ -51,7 +51,6 @@ void PASTEMAC0(opname) \
stor3_t eff_id, \
cntx_t* cntx, \
rntm_t* rntm, \
cntl_t* cntl, \
thrinfo_t* thread \
);
@@ -84,7 +83,6 @@ void PASTEMAC(ch,varname) \
stor3_t eff_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
cntl_t* restrict cntl, \
thrinfo_t* restrict thread \
);
@@ -111,10 +109,98 @@ void PASTEMAC(ch,varname) \
stor3_t eff_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
cntl_t* restrict cntl, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n )
INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m )
// -----------------------------------------------------------------------------
// Adjust the effective storage id (and possibly the transposition state)
// for the sup variants 1n/2m, based on which operands will be packed.
//
//   dt      - datatype used to query the kernel's storage preference.
//   trans   - in/out: toggled when a transposition is induced or cancelled.
//   packa   - whether matrix A will be packed.
//   packb   - whether matrix B will be packed.
//   eff_id  - in/out: effective stor3_t id; rewritten when a cheaper kernel
//             can be used given the packing that will take place.
//   cntx    - context queried for the kernel's row/column preference.
//
// Only row-preferential kernels are handled; for column-preferential
// kernels a message is printed and bli_abort() is called.
static void bli_gemmsup_ref_var1n2m_opt_cases
     (
       num_t    dt,
       trans_t* trans,
       bool_t   packa,
       bool_t   packb,
       stor3_t* eff_id,
       cntx_t*  cntx
     )
{
	// Column-preferential kernels are not yet supported here.
	if ( !bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ) )
	{
		//bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
		printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" );
		bli_abort();
		return;
	}

	// Snapshot the incoming storage id; at most one case below applies.
	const stor3_t id = *eff_id;

	if ( packa && packb )
	{
		if ( id == BLIS_RRC )
		{
			// C is already row-stored, so the BLIS_RRR kernel can be used.
			*eff_id = BLIS_RRR;
		}
		else if ( id == BLIS_CRC )
		{
			// This case is BLIS_RRC once transposed later on (both matrices
			// still packed), so select BLIS_CCC, which becomes BLIS_RRR
			// under that transposition.
			*eff_id = BLIS_CCC;
		}
		else if ( id == BLIS_CRR )
		{
			// Induce a transpose so that C is row-stored. The case is
			// BLIS_RCC once transposed (both matrices still packed), so
			// select BLIS_CCC, which becomes BLIS_RRR when transposed.
			*trans  = bli_trans_toggled( *trans );
			*eff_id = BLIS_CCC;
		}
	}
	else if ( packb )
	{
		if ( id == BLIS_RRC )
		{
			// C is already row-stored, so the BLIS_RRR kernel can be used.
			*eff_id = BLIS_RRR;
		}
		else if ( id == BLIS_RCC )
		{
			// C is already row-stored; cancel the transposition and use
			// the BLIS_RCR kernel instead.
			*trans  = bli_trans_toggled( *trans );
			*eff_id = BLIS_RCR;
		}
		// BLIS_CRC: becomes BLIS_RRC when transposed later (with packa
		// instead of packb). No transformation is beneficial here.
		//
		// BLIS_CRR: toggling the transposition and switching to BLIS_CRC
		// was tried and left disabled because it performed poorly (theory:
		// packing the former B to row storage is slow).
	}
	else if ( packa )
	{
		if ( id == BLIS_CRR )
		{
			// Induce a transpose so that C is row-stored, and select
			// BLIS_CCR, which becomes BLIS_RCR when transposed later.
			*trans  = bli_trans_toggled( *trans );
			*eff_id = BLIS_CCR;
		}
	}
}

View File

@@ -53,6 +53,15 @@ void bli_l3_thrinfo_free
bli_thrinfo_free( rntm, thread );
}
// Release a thrinfo_t node used by the sup code path. This forwards both
// arguments, unchanged, to bli_thrinfo_free().
//   rntm   - runtime object passed through to bli_thrinfo_free().
//   thread - the thrinfo_t node to free.
void bli_l3_sup_thrinfo_free
(
rntm_t* rntm,
thrinfo_t* thread
)
{
bli_thrinfo_free( rntm, thread );
}
// -----------------------------------------------------------------------------
void bli_l3_thrinfo_create_root
@@ -94,6 +103,74 @@ void bli_l3_thrinfo_create_root
// -----------------------------------------------------------------------------
// Create the root thrinfo_t node for one thread on the sup code path.
//
//   id      - this thread's id; used directly as its global communicator id.
//   gl_comm - global communicator shared by all threads.
//   rntm    - runtime object holding the ways of parallelism.
//   thread  - out: receives the newly created root node.
void bli_l3_sup_thrinfo_create_root
     (
       dim_t       id,
       thrcomm_t*  gl_comm,
       rntm_t*     rntm,
       thrinfo_t** thread
     )
{
	// The top-most ways of parallelism are queried via the BLIS_NC blocksize
	// id. Hard-coding BLIS_NC is a bit of a hack, but it works since both sup
	// algorithms (bp and pb) use the cache blocksizes down to the 3rd loop.
	// (See bli_rntm_calc_num_threads_bp() and bli_rntm_calc_num_threads_pb()
	// for a concise enumeration of these bszid_t ids.)
	const bszid_t bszid = BLIS_NC;

	// Total thread count comes from the global communicator.
	const dim_t nt_total = bli_thrcomm_num_threads( gl_comm );

	// Ways of parallelism at the root level.
	const dim_t n_way = bli_rntm_ways_for( bszid, rntm );

	// Each group of ( nt_total / n_way ) consecutive thread ids shares one
	// work id.
	const dim_t work_id = id / ( nt_total / n_way );

	// Build the root node; the thread id doubles as the communicator id.
	*thread = bli_thrinfo_create
	(
	  rntm,
	  gl_comm,
	  id,
	  n_way,
	  work_id,
	  TRUE,
	  bszid,
	  NULL
	);
}
// -----------------------------------------------------------------------------
// Refresh an existing root thrinfo_t node after the ways of parallelism
// stored in the rntm_t have changed.
//
//   rntm   - runtime object holding the updated ways of parallelism.
//   thread - root node whose n_way and work_id fields are overwritten.
void bli_l3_sup_thrinfo_update_root
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	// Thread count and (global) comm id are taken from the current root.
	const dim_t nt_total = bli_thread_num_threads( thread );
	const dim_t comm_id  = bli_thread_ocomm_id( thread );

	// Updated ways of parallelism for the top-most (BLIS_NC) loop.
	const dim_t n_way = bli_rntm_ways_for( BLIS_NC, rntm );

	// Store the new ways, then the work id recomputed from them.
	bli_thrinfo_set_n_way( n_way, thread );
	bli_thrinfo_set_work_id( comm_id / ( nt_total / n_way ), thread );
}
// -----------------------------------------------------------------------------
void bli_l3_thrinfo_print_gemm_paths
(
thrinfo_t** threads

View File

@@ -93,6 +93,12 @@ void bli_l3_thrinfo_free
thrinfo_t* thread
);
// Free a thrinfo_t node used by the sup code path. The definition forwards
// both arguments to bli_thrinfo_free().
void bli_l3_sup_thrinfo_free
(
rntm_t* rntm,
thrinfo_t* thread
);
// -----------------------------------------------------------------------------
void bli_l3_thrinfo_create_root
@@ -104,6 +110,20 @@ void bli_l3_thrinfo_create_root
thrinfo_t** thread
);
// Create the root thrinfo_t node for the sup code path.
//   id      - thread id, used as the global communicator id.
//   gl_comm - global communicator; also queried for the total thread count.
//   rntm    - runtime object queried for the BLIS_NC ways of parallelism.
//   thread  - out: receives the node returned by bli_thrinfo_create().
void bli_l3_sup_thrinfo_create_root
(
dim_t id,
thrcomm_t* gl_comm,
rntm_t* rntm,
thrinfo_t** thread
);
// Recompute the n_way and work_id fields of an existing root thrinfo_t node
// from the (possibly updated) BLIS_NC ways of parallelism stored in rntm.
void bli_l3_sup_thrinfo_update_root
(
rntm_t* rntm,
thrinfo_t* thread
);
void bli_l3_thrinfo_print_gemm_paths
(
thrinfo_t** threads

View File

@@ -360,7 +360,7 @@ dim_t bli_cntl_calc_num_threads_in
bszid_t bszid = bli_cntl_bszid( cntl );
dim_t cur_way;
// We assume bszid is in {KR,MR,NR,MC,KC,NR} if it is not
// We assume bszid is in {NC,KC,MC,NR,MR,KR} if it is not
// BLIS_NO_PART.
if ( bszid != BLIS_NO_PART )
cur_way = bli_rntm_ways_for( bszid, rntm );

View File

@@ -169,14 +169,18 @@ void bli_rntm_set_ways_from_rntm
dim_t jr = bli_rntm_jr_ways( rntm );
dim_t ir = bli_rntm_ir_ways( rntm );
bool_t auto_factor = FALSE;
#ifdef BLIS_ENABLE_MULTITHREADING
bool_t nt_set = FALSE;
bool_t ways_set = FALSE;
// If the rntm was fed in as a copy of the global runtime via
// bli_rntm_init_from_global(), we know that either the num_threads
// field will be set and all of the ways unset, or vice versa.
// bli_rntm_init_from_global(), we know that either:
// - the num_threads field is -1 and all of the ways are -1;
// - the num_threads field is -1 and all of the ways are set;
// - the num_threads field is set and all of the ways are -1.
// However, we can't be sure that a user-provided rntm_t isn't
// initialized uncleanly. So here we have to enforce some rules
// to get the rntm_t into a predictable state.
@@ -184,6 +188,9 @@ void bli_rntm_set_ways_from_rntm
// First, we establish whether or not the number of threads is set.
if ( nt > 0 ) nt_set = TRUE;
// Take this opportunity to set the auto_factor field.
if ( nt_set ) auto_factor = TRUE;
// Next, we establish whether or not any of the ways of parallelism
// for each loop were set. If any of the ways are set (positive), we
// then we assume the user wanted to use those positive values and
@@ -220,8 +227,8 @@ void bli_rntm_set_ways_from_rntm
pc = 1;
bli_partition_2x2( nt, m*BLIS_THREAD_RATIO_M,
n*BLIS_THREAD_RATIO_N, &ic, &jc );
bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M,
n*BLIS_THREAD_RATIO_N, &ic, &jc );
for ( ir = BLIS_THREAD_MAX_IR ; ir > 1 ; ir-- )
{
@@ -253,12 +260,16 @@ void bli_rntm_set_ways_from_rntm
#endif
// Save the results back in the runtime object.
bli_rntm_set_auto_factor_only( auto_factor, rntm );
bli_rntm_set_num_threads_only( nt, rntm );
bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
}
void bli_rntm_print
void bli_rntm_set_ways_from_rntm_sup
(
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm
)
{
@@ -270,8 +281,193 @@ void bli_rntm_print
dim_t jr = bli_rntm_jr_ways( rntm );
dim_t ir = bli_rntm_ir_ways( rntm );
bool_t auto_factor = FALSE;
#ifdef BLIS_ENABLE_MULTITHREADING
bool_t nt_set = FALSE;
bool_t ways_set = FALSE;
// If the rntm was fed in as a copy of the global runtime via
// bli_rntm_init_from_global(), we know that either:
// - the num_threads field is -1 and all of the ways are -1;
// - the num_threads field is -1 and all of the ways are set;
// - the num_threads field is set and all of the ways are -1.
// However, we can't be sure that a user-provided rntm_t isn't
// initialized uncleanly. So here we have to enforce some rules
// to get the rntm_t into a predictable state.
// First, we establish whether or not the number of threads is set.
if ( nt > 0 ) nt_set = TRUE;
// Take this opportunity to set the auto_factor field.
if ( nt_set ) auto_factor = TRUE;
// Next, we establish whether or not any of the ways of parallelism
// for each loop were set. If any of the ways are set (positive), then
// we assume the user wanted to use those positive values and default
// the non-positive values to 1.
if ( jc > 0 || pc > 0 || ic > 0 || jr > 0 || ir > 0 )
{
ways_set = TRUE;
if ( jc < 1 ) jc = 1;
if ( pc < 1 ) pc = 1;
if ( ic < 1 ) ic = 1;
if ( jr < 1 ) jr = 1;
if ( ir < 1 ) ir = 1;
}
// Now we use the values of nt_set and ways_set to determine how to
// interpret the original values we found in the rntm_t object.
if ( ways_set == TRUE )
{
// If the ways were set, then we use the values that were given
// and interpreted above (we set any non-positive value to 1).
// The only thing left to do is calculate the correct number of
// threads.
nt = jc * pc * ic * jr * ir;
}
else if ( ways_set == FALSE && nt_set == TRUE )
{
// If the ways were not set but the number of threads was set, then
// we attempt to automatically generate a thread factorization that
// will work given the problem size. Thus, here we only set the
// ways and leave the number of threads unchanged.
pc = 1;
//bli_thread_partition_2x2( nt, m*BLIS_THREAD_SUP_RATIO_M,
// n*BLIS_THREAD_SUP_RATIO_N, &ic, &jc );
bli_thread_partition_2x2( nt, m,
n, &ic, &jc );
//printf( "bli_rntm_set_ways_from_rntm_sup(): jc = %d ic = %d\n", (int)jc, (int)ic );
#if 0
for ( ir = BLIS_THREAD_SUP_MAX_IR ; ir > 1 ; ir-- )
{
if ( ic % ir == 0 ) { ic /= ir; break; }
}
for ( jr = BLIS_THREAD_SUP_MAX_JR ; jr > 1 ; jr-- )
{
if ( jc % jr == 0 ) { jc /= jr; break; }
}
#else
ir = 1;
jr = 1;
#endif
}
else // if ( ways_set == FALSE && nt_set == FALSE )
{
// If neither the ways nor the number of threads were set, then
// the rntm was not meaningfully changed since initialization,
// and thus we'll default to single-threaded execution.
nt = 1;
jc = pc = ic = jr = ir = 1;
}
#else
// When multithreading is disabled, always set the rntm_t ways
// values to 1.
nt = 1;
jc = pc = ic = jr = ir = 1;
#endif
// Save the results back in the runtime object.
bli_rntm_set_auto_factor_only( auto_factor, rntm );
bli_rntm_set_num_threads_only( nt, rntm );
bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
}
// Print the threading-related contents of a rntm_t to stdout: a header
// line followed by one line holding the auto_factor flag, the total number
// of threads, and the ways of parallelism for the jc, pc, ic, jr, and ir
// loops.
//   rntm - runtime object to print; it is only read.
void bli_rntm_print
     (
       rntm_t* rntm
     )
{
	// The auto_factor getter returns a bool_t, so hold it as one.
	bool_t af = bli_rntm_auto_factor( rntm );

	dim_t  nt = bli_rntm_num_threads( rntm );

	dim_t  jc = bli_rntm_jc_ways( rntm );
	dim_t  pc = bli_rntm_pc_ways( rntm );
	dim_t  ic = bli_rntm_ic_ways( rntm );
	dim_t  jr = bli_rntm_jr_ways( rntm );
	dim_t  ir = bli_rntm_ir_ways( rntm );

	printf( "rntm contents nt jc pc ic jr ir\n" );

	// A single, complete printf call: the superseded partial call that
	// lacked the auto_factor field (and its closing parenthesis) is gone.
	printf( "autofac? %1d | %4d%4d%4d%4d%4d%4d\n", (int)af,
	                                               (int)nt, (int)jc, (int)pc,
	                                               (int)ic, (int)jr, (int)ir );
}
// -----------------------------------------------------------------------------
// Compute the number of threads "inside" a given loop level: the product
// of the ways of parallelism for every partitioned loop from *bszid_cur up
// to, but not including, the BLIS_KR terminator.
//
//   bszid_cur - pointer into an array of bszid_t ids that is terminated by
//               BLIS_KR; iteration starts at the element pointed to.
//   rntm      - runtime object queried for the ways of each loop.
//
// Returns 1 if *bszid_cur is already BLIS_KR.
dim_t bli_rntm_calc_num_threads_in
     (
       bszid_t* restrict bszid_cur,
       rntm_t*  restrict rntm
     )
{
	// For reference, the bszids array for the bp algorithm is laid out as:
	//
	//   BLIS_NC,      // level 0: 5th loop
	//   BLIS_KC,      // level 1: 4th loop
	//   BLIS_NO_PART, // level 2: pack B
	//   BLIS_MC,      // level 3: 3rd loop
	//   BLIS_NO_PART, // level 4: pack A
	//   BLIS_NR,      // level 5: 2nd loop
	//   BLIS_MR,      // level 6: 1st loop
	//   BLIS_KR       // level 7: ukr loop
	//
	// The pb algorithm's array likewise ends with BLIS_NR, BLIS_MR, BLIS_KR
	// at levels 5 through 7.

	dim_t product = 1;

	// Walk the array until the BLIS_KR terminator is reached.
	while ( *bszid_cur != BLIS_KR )
	{
		const bszid_t cur = *bszid_cur;

		// Entries other than BLIS_NO_PART are assumed to be one of
		// {NC,KC,MC,NR,MR,KR}; BLIS_NO_PART (packing) levels contribute
		// no parallelism and are skipped.
		if ( cur != BLIS_NO_PART )
			product *= bli_rntm_ways_for( cur, rntm );

		++bszid_cur;
	}

	return product;
}
#if 0
for ( ; *bszid_cur != BLIS_KR; bszid_cur++ )
{
const bszid_t bszid = *bszid_cur;
dim_t cur_way = 1;
// We assume bszid is in {NC,KC,MC,NR,MR,KR} if it is not
// BLIS_NO_PART.
if ( bszid != BLIS_NO_PART )
cur_way = bli_rntm_ways_for( bszid, rntm );
else
cur_way = 1;
n_threads_in *= cur_way;
}
#endif

View File

@@ -43,6 +43,8 @@
/*
typedef struct rntm_s
{
bool_t auto_factor;
dim_t num_threads;
dim_t* thrloop;
dim_t pack_a;
@@ -59,6 +61,11 @@ typedef struct rntm_s
// -- rntm_t query (public API) ------------------------------------------------
//
// Return the auto_factor field of the given rntm_t. Per the commit notes,
// this field tracks whether automatic thread factorization was originally
// requested.
static bool_t bli_rntm_auto_factor( rntm_t* rntm )
{
return rntm->auto_factor;
}
static dim_t bli_rntm_num_threads( rntm_t* rntm )
{
return rntm->num_threads;
@@ -122,6 +129,7 @@ static membrk_t* bli_rntm_membrk( rntm_t* rntm )
return rntm->membrk;
}
#if 0
static dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 )
{
const bool_t nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 );
@@ -135,11 +143,17 @@ static dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 )
if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE;
else return FALSE;
}
#endif
//
// -- rntm_t modification (internal use only) ----------------------------------
//
// Overwrite only the auto_factor field of the given rntm_t; no other field
// is touched.
static void bli_rntm_set_auto_factor_only( bool_t auto_factor, rntm_t* rntm )
{
rntm->auto_factor = auto_factor;
}
static void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm )
{
rntm->num_threads = nt;
@@ -292,17 +306,20 @@ static void bli_rntm_clear_l3_sup( rntm_t* rntm )
#define BLIS_RNTM_INITIALIZER \
{ \
.auto_factor = TRUE, \
.num_threads = -1, \
.thrloop = { -1, -1, -1, -1, -1, -1 }, \
.pack_a = TRUE, \
.pack_b = TRUE, \
.l3_sup = TRUE \
.l3_sup = TRUE, \
.sba_pool = NULL, \
.membrk = NULL, \
} \
static void bli_rntm_init( rntm_t* rntm )
{
bli_rntm_set_auto_factor_only( TRUE, rntm );
bli_rntm_clear_num_threads_only( rntm );
bli_rntm_clear_ways_only( rntm );
bli_rntm_clear_pack_a( rntm );
@@ -313,6 +330,24 @@ static void bli_rntm_init( rntm_t* rntm )
bli_rntm_clear_membrk( rntm );
}
// -- rntm_t total thread calculation ------------------------------------------
// Return the total number of threads implied by a rntm_t, computed as the
// product of the ways of parallelism queried for the BLIS_NC, BLIS_KC,
// BLIS_MC, BLIS_NR, and BLIS_MR blocksize ids.
static dim_t bli_rntm_calc_num_threads
     (
       rntm_t* restrict rntm
     )
{
	return bli_rntm_ways_for( BLIS_NC, rntm ) *
	       bli_rntm_ways_for( BLIS_KC, rntm ) *
	       bli_rntm_ways_for( BLIS_MC, rntm ) *
	       bli_rntm_ways_for( BLIS_NR, rntm ) *
	       bli_rntm_ways_for( BLIS_MR, rntm );
}
// -----------------------------------------------------------------------------
// Function prototypes
@@ -337,10 +372,24 @@ void bli_rntm_set_ways_from_rntm
rntm_t* rntm
);
// Sup-path routine that interprets the number of threads and ways of
// parallelism found in rntm and writes back the auto_factor flag, the
// number of threads, and the jc/pc/ic/jr/ir ways. When only the number of
// threads was set, the definition factors it into ic and jc ways via
// bli_thread_partition_2x2() using m and n, and sets pc, jr, and ir to 1.
//   m, n - problem dimensions used for the automatic factorization.
//   k    - NOTE(review): not referenced in the visible part of the
//          definition; confirm whether it is intentionally unused.
//   rntm - in/out runtime object.
void bli_rntm_set_ways_from_rntm_sup
(
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm
);
void bli_rntm_print
(
rntm_t* rntm
);
// Return the product of the ways of parallelism (queried from rntm) for the
// bszid_t ids starting at *bszid_cur and stopping at the first BLIS_KR;
// BLIS_NO_PART entries are skipped.
dim_t bli_rntm_calc_num_threads_in
(
bszid_t* restrict bszid_cur,
rntm_t* restrict rntm
);
#endif

View File

@@ -38,6 +38,8 @@
// -- Define default threading parameters --------------------------------------
// -- Conventional (large code path) values --
#ifndef BLIS_THREAD_RATIO_M
#define BLIS_THREAD_RATIO_M 2
#endif
@@ -54,6 +56,26 @@
#define BLIS_THREAD_MAX_JR 4
#endif
#if 0
// -- Skinny/small possibly-unpacked (sup code path) values --
#ifndef BLIS_THREAD_SUP_RATIO_M
#define BLIS_THREAD_SUP_RATIO_M 1
#endif
#ifndef BLIS_THREAD_SUP_RATIO_N
#define BLIS_THREAD_SUP_RATIO_N 2
#endif
#ifndef BLIS_THREAD_SUP_MAX_IR
#define BLIS_THREAD_SUP_MAX_IR 1
#endif
#ifndef BLIS_THREAD_SUP_MAX_JR
#define BLIS_THREAD_SUP_MAX_JR 8
#endif
#endif
// -- Memory allocation --------------------------------------------------------

View File

@@ -1452,6 +1452,7 @@ typedef struct cntx_s
typedef struct rntm_s
{
// "External" fields: these may be queried by the end-user.
bool_t auto_factor;
dim_t num_threads;
dim_t thrloop[ BLIS_NUM_LOOPS ];

View File

@@ -125,7 +125,7 @@ void bli_l3_thread_decorator
// Alias thread-local copies of A, B, and C. These will be the objects
// we pass down the algorithmic function stack. Making thread-local
// alaises is highly recommended in case a thread needs to change any
// aliases is highly recommended in case a thread needs to change any
// of the properties of an object without affecting other threads'
// objects.
bli_obj_alias_to( a, &a_t );

View File

@@ -96,7 +96,7 @@ void* bli_l3_thread_entry( void* data_void )
// Alias thread-local copies of A, B, and C. These will be the objects
// we pass down the algorithmic function stack. Making thread-local
// alaises is highly recommended in case a thread needs to change any
// aliases is highly recommended in case a thread needs to change any
// of the properties of an object without affecting other threads'
// objects.
bli_obj_alias_to( a, &a_t );

View File

@@ -48,7 +48,6 @@ typedef err_t (*l3supint_t)
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
@@ -57,8 +56,6 @@ err_t bli_l3_sup_thread_decorator
(
l3supint_t func,
opid_t family,
//pack_t schema_a,
//pack_t schema_b,
obj_t* alpha,
obj_t* a,
obj_t* b,

View File

@@ -40,16 +40,14 @@
// Define a dummy function bli_l3_sup_thread_entry(), which is needed in the
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
// or no multithreading) we don't risk having an unresolved symbol.
//void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
//#define PRINT_THRINFO
err_t bli_l3_sup_thread_decorator
(
l3supint_t func,
opid_t family,
//pack_t schema_a,
//pack_t schema_b,
obj_t* alpha,
obj_t* a,
obj_t* b,
@@ -59,36 +57,8 @@ err_t bli_l3_sup_thread_decorator
rntm_t* rntm
)
{
#if 0
return
bli_gemmsup_int
(
alpha,
a,
b,
beta,
c,
cntx,
rntm,
0
);
#else
// This is part of a hack to support mixed domain in bli_gemm_front().
// Sometimes we need to specify a non-standard schema for A and B, and
// we decided to transmit them via the schema field in the obj_t's
// rather than pass them in as function parameters. Once the values
// have been read, we immediately reset them back to their expected
// values for unpacked objects.
//pack_t schema_a = bli_obj_pack_schema( a );
//pack_t schema_b = bli_obj_pack_schema( b );
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
// For sequential execution, we use only one thread.
const dim_t n_threads = 1;
// Query the total number of threads from the rntm_t object.
const dim_t n_threads = bli_rntm_num_threads( rntm );
// NOTE: The sba was initialized in bli_init().
@@ -99,55 +69,45 @@ err_t bli_l3_sup_thread_decorator
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we can create the global comm below.
// this up-front only so that we have the rntm_t.sba_pool field
// initialized and ready for the global communicator creation below.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm.
// Set the packing block allocator field of the rntm. This will be
// inherited by all of the child threads when they make local copies of
// the rntm below.
bli_membrk_rntm_set_membrk( rntm );
#if 0
// Allcoate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
#endif
_Pragma( "omp parallel num_threads(n_threads)" )
{
// NOTE: We don't need to create another copy of the rntm_t since
// it was already copied in one of the high-level oapi functions.
rntm_t* restrict rntm_p = rntm;
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
cntl_t* cntl_use = NULL;
//thrinfo_t* thread = NULL;
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
const dim_t tid = 0;
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
// NOTE: This calls the same function used for the conventional/large
// code path.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
// NOTE: This is commented out because, in the single-threaded case,
// this is redundant since it's already been done above.
//bli_sba_rntm_set_pool( tid, array, rntm_p );
bli_sba_rntm_set_pool( tid, array, rntm_p );
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
// need to alias objects for A, B, and C since they were already aliased
// in bli_*_front(). However, we may add aliasing here in the future so
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
// consistently providing local aliases, we can then eliminate aliasing
// elsewhere.
// Create a default control tree for the operation, if needed.
//bli_l3_cntl_create_if( family, schema_a, schema_b,
// a, b, c, rntm_p, cntl, &cntl_use );
#if 0
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
thrinfo_t* thread = NULL;
// Create the root node of the thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
#endif
( void )tid;
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
func
(
@@ -158,23 +118,16 @@ err_t bli_l3_sup_thread_decorator
c,
cntx,
rntm_p,
cntl_use,
thread
);
#if 0
// Free the thread's local control tree.
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( rntm_p, thread );
#endif
bli_l3_sup_thrinfo_free( rntm_p, thread );
}
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
// (called from the thread entry function).
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
@@ -182,8 +135,6 @@ err_t bli_l3_sup_thread_decorator
bli_sba_checkin_array( array );
return BLIS_SUCCESS;
#endif
}
#endif

View File

@@ -37,12 +37,82 @@
#ifdef BLIS_ENABLE_PTHREADS
// A data structure to assist in passing operands to additional threads.
// One instance is handed (as a void*) to bli_l3_sup_thread_entry() per
// thread.
typedef struct thread_data
{
// Function invoked by the thread entry point with the operands below.
l3supint_t func;
// Operation family id; read by the entry point but otherwise unused there.
opid_t family;
// Operands and context forwarded unchanged to func.
obj_t* alpha;
obj_t* a;
obj_t* b;
obj_t* beta;
obj_t* c;
cntx_t* cntx;
// Master thread's rntm_t; the entry point makes a thread-local copy of it.
rntm_t* rntm;
// This thread's id; used to select its sba pool and to create its root
// thrinfo_t node.
dim_t tid;
// Global communicator passed to the root thrinfo_t creation.
thrcomm_t* gl_comm;
// Array passed to bli_sba_rntm_set_pool() along with tid.
array_t* array;
} thread_data_t;
// Entry point for additional threads
// Entry point executed by every thread (including thread 0) on the
// pthreads sup code path.
//   data_void - pointer to this thread's thread_data_t.
// Always returns NULL.
void* bli_l3_sup_thread_entry( void* data_void )
{
	const thread_data_t* args = data_void;

	// Work on a thread-local copy of the master thread's rntm_t so that
	// each thread can track its own small block pool_t as it executes down
	// the function stack.
	rntm_t           rntm_l = *( args->rntm );
	rntm_t* restrict rntm_p = &rntm_l;

	// Use the thread id to select the appropriate pool_t* within the
	// array_t and store it in the local rntm_t's sba_pool field. If that
	// pool_t* element is NULL, it will first be allocated/initialized.
	bli_sba_rntm_set_pool( args->tid, args->array, rntm_p );

	// Create the root node of this thread's thrinfo_t structure.
	thrinfo_t* thread = NULL;
	bli_l3_sup_thrinfo_create_root( args->tid, args->gl_comm, rntm_p, &thread );

	// Run the operation. (The family field of the argument struct is not
	// needed here.)
	args->func
	(
	  args->alpha,
	  args->a,
	  args->b,
	  args->beta,
	  args->c,
	  args->cntx,
	  rntm_p,
	  thread
	);

	// Free this thread's thrinfo_t structure.
	bli_l3_sup_thrinfo_free( rntm_p, thread );

	return NULL;
}
err_t bli_l3_sup_thread_decorator
(
l3supint_t func,
opid_t family,
//pack_t schema_a,
//pack_t schema_b,
obj_t* alpha,
obj_t* a,
obj_t* b,
@@ -52,36 +122,8 @@ err_t bli_l3_sup_thread_decorator
rntm_t* rntm
)
{
#if 0
return
bli_gemmsup_int
(
alpha,
a,
b,
beta,
c,
cntx,
rntm,
0
);
#else
// This is part of a hack to support mixed domain in bli_gemm_front().
// Sometimes we need to specify a non-standard schema for A and B, and
// we decided to transmit them via the schema field in the obj_t's
// rather than pass them in as function parameters. Once the values
// have been read, we immediately reset them back to their expected
// values for unpacked objects.
//pack_t schema_a = bli_obj_pack_schema( a );
//pack_t schema_b = bli_obj_pack_schema( b );
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
// For sequential execution, we use only one thread.
const dim_t n_threads = 1;
// Query the total number of threads from the context.
const dim_t n_threads = bli_rntm_num_threads( rntm );
// NOTE: The sba was initialized in bli_init().
@@ -92,91 +134,82 @@ err_t bli_l3_sup_thread_decorator
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we can create the global comm below.
// this up-front only so that we have the rntm_t.sba_pool field
// initialized and ready for the global communicator creation below.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm.
// Set the packing block allocator field of the rntm. This will be
// inherited by all of the child threads when they make local copies of
// the rntm below.
bli_membrk_rntm_set_membrk( rntm );
#if 0
// Allcoate a global communicator for the root thrinfo_t structures.
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
#endif
// Allocate an array of pthread objects and auxiliary data structs to pass
// to the thread entry functions.
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_l3_thread_decorator().pth: " );
#endif
bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_l3_thread_decorator().pth: " );
#endif
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
// can spawn all other threads before proceeding with its own computation.
for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
{
// NOTE: We don't need to create another copy of the rntm_t since
// it was already copied in one of the high-level oapi functions.
rntm_t* restrict rntm_p = rntm;
// Set up thread data for additional threads (beyond thread 0).
datas[tid].func = func;
datas[tid].family = family;
datas[tid].alpha = alpha;
datas[tid].a = a;
datas[tid].b = b;
datas[tid].beta = beta;
datas[tid].c = c;
datas[tid].cntx = cntx;
datas[tid].rntm = rntm;
datas[tid].tid = tid;
datas[tid].gl_comm = gl_comm;
datas[tid].array = array;
cntl_t* cntl_use = NULL;
//thrinfo_t* thread = NULL;
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
const dim_t tid = 0;
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
// NOTE: This is commented out because, in the single-threaded case,
// this is redundant since it's already been done above.
//bli_sba_rntm_set_pool( tid, array, rntm_p );
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
// need to alias objects for A, B, and C since they were already aliased
// in bli_*_front(). However, we may add aliasing here in the future so
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
// consistently providing local aliases, we can then eliminate aliasing
// elsewhere.
// Create a default control tree for the operation, if needed.
//bli_l3_cntl_create_if( family, schema_a, schema_b,
// a, b, c, rntm_p, cntl, &cntl_use );
#if 0
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
// Create the root node of the thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
#endif
( void )tid;
func
(
alpha,
a,
b,
beta,
c,
cntx,
rntm_p,
cntl_use,
thread
);
#if 0
// Free the thread's local control tree.
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( rntm_p, thread );
#endif
// Spawn additional threads for ids greater than 1.
if ( tid != 0 )
bli_pthread_create( &pthreads[tid], NULL, &bli_l3_sup_thread_entry, &datas[tid] );
else
bli_l3_sup_thread_entry( ( void* )(&datas[0]) );
}
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
// (called from the thread entry function).
// Thread 0 waits for additional threads to finish.
for ( dim_t tid = 1; tid < n_threads; tid++ )
{
bli_pthread_join( pthreads[tid], NULL );
}
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
return BLIS_SUCCESS;
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_l3_thread_decorator().pth: " );
#endif
bli_free_intl( pthreads );
#endif
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_l3_thread_decorator().pth: " );
#endif
bli_free_intl( datas );
return BLIS_SUCCESS;
}
#endif

View File

@@ -969,7 +969,7 @@ siz_t bli_thread_range_weighted_b2t
// Initialize the prime-factorization state in *factors for the integer n.
//
// Stores n itself, the square root of n truncated to dim_t, and sets the
// f field to 2 (presumably the first candidate factor that
// bli_next_prime_factor() will try -- confirm against that function).
void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors )
{
	factors->n      = n;
	// sqrt() operates on doubles; convert explicitly rather than relying on
	// an implicit integer-to-double conversion, then truncate back to dim_t.
	factors->sqrt_n = ( dim_t )sqrt( ( double )n );
	factors->f      = 2;
}
@@ -1040,26 +1040,36 @@ dim_t bli_next_prime_factor( bli_prime_factors_t* factors )
return tmp;
}
void bli_partition_2x2( dim_t nthread, dim_t work1, dim_t work2,
dim_t* nt1, dim_t* nt2 )
#if 0
#include "limits.h"
#endif
void bli_thread_partition_2x2
(
dim_t n_thread,
dim_t work1,
dim_t work2,
dim_t* restrict nt1,
dim_t* restrict nt2
)
{
// Partition a number of threads into two factors nt1 and nt2 such that
// nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a
// slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|).
// Return early small prime numbers of threads.
if (nthread < 4)
if ( n_thread < 4 )
{
*nt1 = ( work1 >= work2 ? nthread : 1 );
*nt2 = ( work1 < work2 ? nthread : 1 );
*nt1 = ( work1 >= work2 ? n_thread : 1 );
*nt2 = ( work1 < work2 ? n_thread : 1 );
}
*nt1 = 1;
*nt2 = 1;
// Both algorithms need the prime factorization of nthread.
// Both algorithms need the prime factorization of n_thread.
bli_prime_factors_t factors;
bli_prime_factorization( nthread, &factors );
bli_prime_factorization( n_thread, &factors );
#if 1
@@ -1086,10 +1096,10 @@ void bli_partition_2x2( dim_t nthread, dim_t work1, dim_t work2,
#else
// Slow algorithm: exhaustively constructs all factor pairs of nthread and
// Slow algorithm: exhaustively constructs all factor pairs of n_thread and
// chooses the best one.
// Eight prime factors handles nthread up to 223092870.
// Eight prime factors handles n_thread up to 223092870.
dim_t fact[8];
dim_t mult[8];
@@ -1123,7 +1133,7 @@ void bli_partition_2x2( dim_t nthread, dim_t work1, dim_t work2,
// Loop over how many prime factors to assign to the first factor in the
// pair, for each prime factor. The total number of iterations is
// \Prod_{i=0}^{nfact-1} mult[i].
bool done = false;
bool_t done = FALSE;
while ( !done )
{
dim_t x = 1;
@@ -1152,7 +1162,7 @@ void bli_partition_2x2( dim_t nthread, dim_t work1, dim_t work2,
if ( ++ntake[i] > mult[i] )
{
ntake[i] = 0;
if ( i == nfact-1 ) done = true;
if ( i == nfact-1 ) done = TRUE;
else continue;
}
break;
@@ -1284,8 +1294,9 @@ void bli_thread_init_rntm_from_env
// function is only called from bli_thread_init(), which is only called
// by bli_init_once().
dim_t nt;
dim_t jc, pc, ic, jr, ir;
bool_t auto_factor = FALSE;
dim_t nt;
dim_t jc, pc, ic, jr, ir;
#ifdef BLIS_ENABLE_MULTITHREADING
@@ -1306,8 +1317,8 @@ void bli_thread_init_rntm_from_env
// If any BLIS_*_NT environment variable was set, then we ignore the
// value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the
// BLIS_*_NT values instead (with unset variables being assumed to
// contain 1).
// BLIS_*_NT values instead (with unset variables being treated as if
// they contained 1).
if ( jc != -1 || pc != -1 || ic != -1 || jr != -1 || ir != -1 )
{
if ( jc == -1 ) jc = 1;
@@ -1320,9 +1331,14 @@ void bli_thread_init_rntm_from_env
nt = -1;
}
// By this time, either nt is set and the ways for each loop
// are all unset, OR nt is unset and the ways for each loop
// are all set.
// By this time, one of the following conditions holds:
// - nt is -1 and the ways for each loop are -1.
// - nt is -1 and the ways for each loop are all set.
// - nt is set and the ways for each loop are -1.
// If nt is set (ie: not -1), then we know we will perform an automatic
// thread factorization (later, in bli_rntm.c).
if ( nt != -1 ) auto_factor = TRUE;
#else
@@ -1334,6 +1350,7 @@ void bli_thread_init_rntm_from_env
#endif
// Save the results back in the runtime object.
bli_rntm_set_auto_factor_only( auto_factor, rntm );
bli_rntm_set_num_threads_only( nt, rntm );
bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );

View File

@@ -42,6 +42,7 @@
// Include thread info (thrinfo_t) object definitions and prototypes.
#include "bli_thrinfo.h"
#include "bli_thrinfo_sup.h"
// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others!
@@ -164,7 +165,14 @@ void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);
dim_t bli_next_prime_factor(bli_prime_factors_t* factors);
void bli_partition_2x2(dim_t nthread, dim_t work1, dim_t work2, dim_t* nt1, dim_t* nt2);
void bli_thread_partition_2x2
(
dim_t n_thread,
dim_t work1,
dim_t work2,
dim_t* restrict nt1,
dim_t* restrict nt2
);
// -----------------------------------------------------------------------------

View File

@@ -78,15 +78,15 @@ void bli_thrinfo_init
thrinfo_t* sub_node
)
{
thread->ocomm = ocomm;
thread->ocomm_id = ocomm_id;
thread->n_way = n_way;
thread->work_id = work_id;
thread->free_comm = free_comm;
thread->bszid = bszid;
bli_thrinfo_set_ocomm( ocomm, thread );
bli_thrinfo_set_ocomm_id( ocomm_id, thread );
bli_thrinfo_set_n_way( n_way, thread );
bli_thrinfo_set_work_id( work_id, thread );
bli_thrinfo_set_free_comm( free_comm, thread );
bli_thrinfo_set_bszid( bszid, thread );
thread->sub_prenode = NULL;
thread->sub_node = sub_node;
bli_thrinfo_set_sub_node( sub_node, thread );
bli_thrinfo_set_sub_prenode( NULL, thread );
}
void bli_thrinfo_init_single

View File

@@ -129,6 +129,36 @@ static bool_t bli_thread_am_ochief( thrinfo_t* t )
// thrinfo_t modification
// Store the communicator pointer ocomm in the ocomm field of thrinfo_t node t.
static void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t )
{
t->ocomm = ocomm;
}
// Store ocomm_id (the thread's id within its communicator) in the ocomm_id
// field of thrinfo_t node t.
static void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t )
{
t->ocomm_id = ocomm_id;
}
// Store n_way (the number of ways of parallelism) in the n_way field of
// thrinfo_t node t.
static void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t )
{
t->n_way = n_way;
}
// Store work_id in the work_id field of thrinfo_t node t.
static void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t )
{
t->work_id = work_id;
}
// Store the free_comm flag in thrinfo_t node t. (Presumably this records
// whether the node's communicator should be freed along with the node --
// confirm against bli_thrinfo_free().)
static void bli_thrinfo_set_free_comm( bool_t free_comm, thrinfo_t* t )
{
t->free_comm = free_comm;
}
// Store the blocksize id bszid in the bszid field of thrinfo_t node t.
static void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t )
{
t->bszid = bszid;
}
static void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t )
{
t->sub_node = sub_node;

View File

@@ -0,0 +1,241 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Ensure that the thrinfo_t node `thread` has a sub-node attached.
//
// - rntm:      runtime object, forwarded to bli_thrinfo_sup_rgrow().
// - bszid_par: pointer into an array of bszid_t values; the entry after it
//              (bszid_par + 1) is handed to bli_thrinfo_sup_rgrow() as the
//              "current" entry.
// - thread:    the node to extend.
//
// If `thread` already has a sub-node, it is left untouched.
void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     )
{
	// A non-NULL sub-node means the child was already created on an earlier
	// call; reuse it as-is.
	if ( bli_thrinfo_sub_node( thread ) != NULL ) return;

	// Build the child node (or a chain of nodes, if needed) and obtain a
	// pointer to the top-most one.
	thrinfo_t* child = bli_thrinfo_sup_rgrow
	(
	  rntm,
	  bszid_par,
	  bszid_par + 1,
	  thread
	);

	// Hook the newly created child into its parent.
	bli_thrinfo_set_sub_node( child, thread );
}
// -----------------------------------------------------------------------------
// Recursively create the thrinfo_t node(s) described by the bszid_t array
// entry *bszid_cur (and, when that entry is BLIS_NO_PART, the entries that
// follow it), returning the top-most node created.
//
// - rntm:       runtime object consulted for thread counts.
// - bszid_par:  array entry associated with the parent node.
// - bszid_cur:  array entry for the node to be created.
// - thread_par: the parent thrinfo_t node.
thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     )
{
	thrinfo_t* node;

	// Case 1: the current entry is a partitioning entry. Create the child
	// node directly from the parent.
	if ( *bszid_cur != BLIS_NO_PART )
	{
		node = bli_thrinfo_sup_create_for_cntl
		(
		  rntm,
		  bszid_par,
		  bszid_cur,
		  thread_par
		);

		return node;
	}

	// Case 2: the current entry is BLIS_NO_PART (a non-partitioning entry).
	// First grow the remainder of the chain, starting at the next array
	// entry, and keep a pointer to the top of that segment.
	thrinfo_t* below = bli_thrinfo_sup_rgrow
	(
	  rntm,
	  bszid_par,
	  bszid_cur + 1,
	  thread_par
	);

	// Then place a non-partitioning node on top of that segment. Such a node
	// shares the segment's communicator, uses the communicator id as its
	// work id, and passes free_comm = FALSE because the communicator is
	// owned by the segment beneath it rather than by this node.
	node = bli_thrinfo_create
	(
	  rntm,                                            // rntm
	  bli_thrinfo_ocomm( below ),                      // ocomm
	  bli_thread_ocomm_id( below ),                    // ocomm_id
	  bli_rntm_calc_num_threads_in( bszid_cur, rntm ), // n_way
	  bli_thread_ocomm_id( below ),                    // work_id
	  FALSE,                                           // free_comm
	  BLIS_NO_PART,                                    // bszid
	  below                                            // sub_node
	);

	return node;
}
// Capacity of the local (automatic-storage) array of thrcomm_t pointers used
// by bli_thrinfo_sup_create_for_cntl(). If the parent has more ways of
// parallelism than this, the array is obtained from bli_malloc_intl() instead.
#define BLIS_NUM_STATIC_COMMS 80
// Create and return the child thrinfo_t node for the array entry *bszid_chl,
// given the parent node thread_par.
//
// - rntm:       runtime object used to query thread counts and ways.
// - bszid_par:  array entry for the parent; not referenced in this function
//               body.
// - bszid_chl:  array entry for the child being created.
// - thread_par: the parent thrinfo_t node, whose communicator is used for the
//               broadcast and barriers below.
//
// Every thread in the parent's communicator is expected to call this function,
// since it contains a broadcast and two barriers on thread_par.
thrinfo_t* bli_thrinfo_sup_create_for_cntl
(
rntm_t* rntm,
bszid_t* bszid_par,
bszid_t* bszid_chl,
thrinfo_t* thread_par
)
{
#if 1
// If we are running with a single thread, all of the code can be reduced
// and simplified to this. Note that this path always records BLIS_NO_PART
// as the node's bszid, regardless of *bszid_chl.
if ( bli_rntm_calc_num_threads( rntm ) == 1 )
{
thrinfo_t* thread_chl = bli_thrinfo_create
(
rntm, // rntm
&BLIS_SINGLE_COMM, // ocomm
0, // ocomm_id
1, // n_way
0, // work_id
FALSE, // free_comm
BLIS_NO_PART, // bszid
NULL // sub_node
);
return thread_chl;
}
#endif
// static_comms is a local array belonging to whichever thread executes this
// function; only the parent's chief thread's copy is actually shared (via
// the broadcast below).
thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ];
thrcomm_t** new_comms = NULL;
const dim_t parent_nt_in = bli_thread_num_threads( thread_par );
const dim_t parent_n_way = bli_thread_n_way( thread_par );
const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
const dim_t parent_work_id = bli_thread_work_id( thread_par );
// Sanity check: make sure the number of threads in the parent's
// communicator is divisible by the number of new sub-groups.
if ( parent_nt_in % parent_n_way != 0 )
{
printf( "Assertion failed: parent_nt_in <mod> parent_n_way != 0\n" );
bli_abort();
}
// Compute:
// - the number of threads inside the new child comm,
// - the current thread's id within the new communicator,
// - the current thread's work id, given the ways of parallelism
// to be obtained within the next loop.
const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm );
const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm );
const dim_t child_comm_id = parent_comm_id % child_nt_in;
const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
// NOTE: The disabled debug print below refers to cntl_par, which is not a
// variable in this function; it would need updating before being re-enabled.
//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl );
// The parent's chief thread creates a temporary array of thrcomm_t
// pointers: heap-allocated if parent_n_way exceeds the capacity of the
// local array, and the local array otherwise.
if ( bli_thread_am_ochief( thread_par ) )
{
if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ) );
else
new_comms = static_comms;
}
// Broadcast the temporary array to all threads in the parent's
// communicator.
new_comms = bli_thread_obroadcast( thread_par, new_comms );
// Chiefs in the child communicator allocate the communicator
// object and store it in the array element corresponding to the
// parent's work id.
if ( child_comm_id == 0 )
new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
// Presumably this barrier ensures every child communicator has been stored
// in new_comms before any thread reads its entry below.
bli_thread_obarrier( thread_par );
// All threads create a new thrinfo_t node using the communicator
// that was created by their chief, as identified by parent_work_id.
thrinfo_t* thread_chl = bli_thrinfo_create
(
rntm, // rntm
new_comms[ parent_work_id ], // ocomm
child_comm_id, // ocomm_id
child_n_way, // n_way
child_work_id, // work_id
TRUE, // free_comm
*bszid_chl, // bszid
NULL // sub_node
);
// Presumably this barrier ensures all threads have finished reading
// new_comms before the chief frees it (or returns, invalidating its local
// array).
bli_thread_obarrier( thread_par );
// The parent's chief thread frees the temporary array of thrcomm_t
// pointers, but only if it was heap-allocated above.
if ( bli_thread_am_ochief( thread_par ) )
{
if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
bli_free_intl( new_comms );
}
return thread_chl;
}

View File

@@ -0,0 +1,66 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H
//
// Prototypes for level-3 thrinfo sup functions.
//
// Attach a sub-node to thread if it does not already have one. The sub-node
// is built by bli_thrinfo_sup_rgrow() starting at the array entry that
// follows bszid_par.
void bli_thrinfo_sup_grow
(
rntm_t* rntm,
bszid_t* bszid_par,
thrinfo_t* thread
);
// Recursively create the thrinfo_t node(s) for the array entry bszid_cur
// (continuing through subsequent entries while they are BLIS_NO_PART) and
// return the top-most node created.
thrinfo_t* bli_thrinfo_sup_rgrow
(
rntm_t* rntm,
bszid_t* bszid_par,
bszid_t* bszid_cur,
thrinfo_t* thread_par
);
// Create and return the child thrinfo_t node for the array entry bszid_chl,
// given the parent node thread_par.
thrinfo_t* bli_thrinfo_sup_create_for_cntl
(
rntm_t* rntm,
bszid_t* bszid_par,
bszid_t* bszid_chl,
thrinfo_t* thread_par
);
#endif

580
test/supmt/Makefile Normal file
View File

@@ -0,0 +1,580 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2019, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
#
# Makefile
#
# Field G. Van Zee
#
# Makefile for standalone BLIS test drivers.
#
#
# --- Makefile PHONY target definitions ----------------------------------------
#
# Every target listed here is a convenience alias that does not name a real
# file; declaring it phony keeps a stray file of the same name from masking it.
# The list mirrors the alias targets defined in the "Targets/rules" section.
.PHONY: all st mt \
        blis blissup blislpab eigen openblas blasfeo libxsmm vendor \
        blissup-st blislpab-st eigen-st openblas-st blasfeo-st libxsmm-st vendor-st \
        blissup-mt blislpab-mt eigen-mt openblas-mt vendor-mt \
        clean cleanx
#
# --- Determine makefile fragment location -------------------------------------
#
# Comments:
# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
# the second case because CONFIG_NAME is not yet set.
ifneq ($(strip $(BLIS_INSTALL_PATH)),)
LIB_PATH := $(BLIS_INSTALL_PATH)/lib
INC_PATH := $(BLIS_INSTALL_PATH)/include/blis
SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
else
DIST_PATH := ../..
LIB_PATH = ../../lib/$(CONFIG_NAME)
INC_PATH = ../../include/$(CONFIG_NAME)
SHARE_PATH := ../..
endif
#
# --- Include common makefile definitions --------------------------------------
#
# Include the common makefile fragment.
-include $(SHARE_PATH)/common.mk
#
# --- BLAS and LAPACK implementations ------------------------------------------
#
# BLIS library and header path. This is simply wherever it was installed.
#BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib
#BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis
# BLIS library.
#BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a
# BLAS library path(s). This is where the BLAS libraries reside.
HOME_LIB_PATH := $(HOME)/flame/lib
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
# netlib BLAS
NETLIB_LIB := $(HOME_LIB_PATH)/libblas.a
# OpenBLAS
OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a
OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a
# BLASFEO
BLASFEO_LIB := $(HOME_LIB_PATH)/libblasfeo.a
# libxsmm
LIBXSMM_LIB := $(HOME_LIB_PATH)/libxsmm.a -ldl \
$(NETLIB_LIB) -lgfortran
# ATLAS
ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \
$(HOME_LIB_PATH)/libatlas.a
# Eigen
EIGEN_INC := $(HOME)/flame/eigen/include/eigen3
EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a
EIGENP_LIB := $(EIGEN_LIB)
# MKL
MKL_LIB := -L$(MKL_LIB_PATH) \
-lmkl_intel_lp64 \
-lmkl_core \
-lmkl_sequential \
-lpthread -lm -ldl
MKLP_LIB := -L$(MKL_LIB_PATH) \
-lmkl_intel_lp64 \
-lmkl_core \
-lmkl_gnu_thread \
-lpthread -lm -ldl -fopenmp
#-L$(ICC_LIB_PATH) \
#-lgomp
VENDOR_LIB := $(MKL_LIB)
VENDORP_LIB := $(MKLP_LIB)
#
# --- Problem size definitions -------------------------------------------------
#
# Single core
PS_BEGIN := 4
PS_MAX := 800
PS_INC := 4
# Multicore
P1_BEGIN := 8
P1_MAX := 1600
P1_INC := 8
#
# --- General build definitions ------------------------------------------------
#
TEST_SRC_PATH := .
TEST_OBJ_PATH := .
# Gather all local object files.
TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
$(TEST_OBJ_PATH)/%.o, \
$(wildcard $(TEST_SRC_PATH)/*.c)))
# Override the value of CINCFLAGS so that the value of CFLAGS returned by
# get-frame-cflags-for() is not cluttered up with include paths needed only
# while building BLIS.
CINCFLAGS := -I$(INC_PATH)
# Use the "framework" CFLAGS for the configuration family.
CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
# Add local header paths to CFLAGS.
CFLAGS += -I$(TEST_SRC_PATH)
# Locate the libblis library to which we will link.
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Define a set of CFLAGS for use with C++ and Eigen.
CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS))
CXXFLAGS += -I$(EIGEN_INC)
# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading.
CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS))
CXXFLAGS_MT := -march=native $(CXXFLAGS)
# Single or multithreaded string
STR_ST := -DTHR_STR=\"st\"
STR_MT := -DTHR_STR=\"mt\"
# Number of trials per problem size.
N_TRIALS := -DN_TRIALS=3
# Problem size specification
PDEF_ST := -DP_BEGIN=$(PS_BEGIN) \
-DP_MAX=$(PS_MAX) \
-DP_INC=$(PS_INC)
PDEF_MT := -DP_BEGIN=$(P1_BEGIN) \
-DP_MAX=$(P1_MAX) \
-DP_INC=$(P1_INC)
ifeq ($(E),1)
ERRCHK := -DERROR_CHECK
else
ERRCHK := -DNO_ERROR_CHECK
endif
# Enumerate possible datatypes and computation precisions.
#dts := s d c z
DTS := d
TRANS := n_n \
n_t \
t_n \
t_t
# While BLIS supports all combinations of row and column storage for matrices
# C, A, and B, the alternatives mostly only support CBLAS APIs, which inherently
# support only "all row-storage" or "all column-storage". Thus, we disable the
# building of those other drivers so that compilation/linking completes sooner.
#STORS := r_r_r \
# r_r_c \
# r_c_r \
# r_c_c \
# c_r_r \
# c_r_c \
# c_c_r \
# c_c_c
STORS := r_r_r \
c_c_c
SHAPES := l_l_s \
l_s_l \
s_l_l \
s_s_l \
s_l_s \
l_s_s \
l_l_l
SMS := 6
SNS := 8
SKS := 10
#
# --- Function definitions -----------------------------------------------------
#
# A function to strip the underscores from a list of strings.
stripu = $(subst _,,$(1))
# Various functions that help us construct the datatype combinations and then
# extract the needed datatype strings and C preprocessor define flags.
get-1of2 = $(word 1,$(subst _, ,$(1)))
get-2of2 = $(word 2,$(subst _, ,$(1)))
get-1of3 = $(word 1,$(subst _, ,$(1)))
get-2of3 = $(word 2,$(subst _, ,$(1)))
get-3of3 = $(word 3,$(subst _, ,$(1)))
# Datatype defs.
get-dt-cpp = $(strip \
$(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\
$(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\
$(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
-DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
# Transpose defs.
get-tra-defs-a = $(strip $(subst n,-DTRANSA=BLIS_NO_TRANSPOSE -DA_NOTRANS, \
$(subst t,-DTRANSA=BLIS_TRANSPOSE -DA_TRANS,$(call get-1of2,$(1)))))
get-tra-defs-b = $(strip $(subst n,-DTRANSB=BLIS_NO_TRANSPOSE -DB_NOTRANS, \
$(subst t,-DTRANSB=BLIS_TRANSPOSE -DB_TRANS,$(call get-2of2,$(1)))))
get-tra-defs = $(call get-tra-defs-a,$(1)) $(call get-tra-defs-b,$(1))
# Storage defs.
get-sto-uch-a = $(strip $(subst r,R, \
$(subst c,C,$(call get-1of3,$(1)))))
get-sto-uch-b = $(strip $(subst r,R, \
$(subst c,C,$(call get-2of3,$(1)))))
get-sto-uch-c = $(strip $(subst r,R, \
$(subst c,C,$(call get-3of3,$(1)))))
get-sto-defs = $(strip \
-DSTOR3=BLIS_$(call get-sto-uch-a,$(1))$(call get-sto-uch-b,$(1))$(call get-sto-uch-c,$(1)) \
-DA_STOR_$(call get-sto-uch-a,$(1)) \
-DB_STOR_$(call get-sto-uch-b,$(1)) \
-DC_STOR_$(call get-sto-uch-c,$(1)))
# Dimension defs.
get-shape-defs-cm = $(if $(findstring l,$(1)),-DM_DIM=-1,-DM_DIM=$(2))
get-shape-defs-cn = $(if $(findstring l,$(1)),-DN_DIM=-1,-DN_DIM=$(2))
get-shape-defs-ck = $(if $(findstring l,$(1)),-DK_DIM=-1,-DK_DIM=$(2))
get-shape-defs-m = $(call get-shape-defs-cm,$(call get-1of3,$(1)),$(2))
get-shape-defs-n = $(call get-shape-defs-cn,$(call get-2of3,$(1)),$(2))
get-shape-defs-k = $(call get-shape-defs-ck,$(call get-3of3,$(1)),$(2))
# arguments: 1: shape (w/ underscores) 2: smallm 3: smalln 4: smallk
get-shape-defs = $(strip $(call get-shape-defs-m,$(1),$(2)) \
$(call get-shape-defs-n,$(1),$(3)) \
$(call get-shape-defs-k,$(1),$(4)))
#$(error l_l_s 6 8 4 = $(call get-shape-defs,l_l_s,6,8,4))
# Shape-dimension string.
get-shape-str-ch = $(if $(findstring l,$(1)),p,$(2))
get-shape-str-m = $(call get-shape-str-ch,$(call get-1of3,$(1)),$(2))
get-shape-str-n = $(call get-shape-str-ch,$(call get-2of3,$(1)),$(2))
get-shape-str-k = $(call get-shape-str-ch,$(call get-3of3,$(1)),$(2))
# arguments: 1: shape (w/ underscores) 2: smallm 3: smalln 4: smallk
get-shape-dim-str = m$(call get-shape-str-m,$(1),$(2))n$(call get-shape-str-n,$(1),$(3))k$(call get-shape-str-k,$(1),$(4))
# Implementation defs.
# Define a function to return the appropriate -DSTR= and -D[BLIS|BLAS] flags.
get-imp-defs = $(strip $(subst blissup,-DSTR=\"$(1)\" -DBLIS -DSUP, \
$(subst blislpab,-DSTR=\"$(1)\" -DBLIS, \
$(subst eigen,-DSTR=\"$(1)\" -DEIGEN, \
$(subst openblas,-DSTR=\"$(1)\" -DCBLAS, \
$(subst blasfeo,-DSTR=\"$(1)\" -DCBLAS, \
$(subst libxsmm,-DSTR=\"$(1)\" -DBLAS -DXSMM, \
$(subst vendor,-DSTR=\"$(1)\" -DCBLAS,$(1)))))))))
TRANS0 = $(call stripu,$(TRANS))
STORS0 = $(call stripu,$(STORS))
# Limit BLAS and Eigen to only using all row-stored, or all column-stored matrices.
# Also, limit libxsmm to using all column-stored matrices since it does not offer
# CBLAS interfaces.
BSTORS0 = rrr ccc
ESTORS0 = rrr ccc
XSTORS0 = ccc
#
# --- Object and binary file definitions ---------------------------------------
#
get-st-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_st.o)))))))
# Build a list of object files and binaries for each single-threaded
# implementation using the get-st-objs() function defined above.
BLISSUP_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup)
BLISSUP_ST_BINS := $(patsubst %.o,%.x,$(BLISSUP_ST_OBJS))
BLISLPAB_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab)
BLISLPAB_ST_BINS := $(patsubst %.o,%.x,$(BLISLPAB_ST_OBJS))
EIGEN_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen)
EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
OPENBLAS_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas)
OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
BLASFEO_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blasfeo)
BLASFEO_ST_BINS := $(patsubst %.o,%.x,$(BLASFEO_ST_OBJS))
LIBXSMM_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),libxsmm)
LIBXSMM_ST_BINS := $(patsubst %.o,%.x,$(LIBXSMM_ST_OBJS))
VENDOR_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor)
VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
# Mark the object files as intermediate so that make will remove them
# automatically after building the binaries on which they depend.
.INTERMEDIATE: $(BLISSUP_ST_OBJS) \
$(BLISLPAB_ST_OBJS) \
$(EIGEN_ST_OBJS) \
$(OPENBLAS_ST_OBJS) \
$(BLASFEO_ST_OBJS) \
$(LIBXSMM_ST_OBJS) \
$(VENDOR_ST_OBJS)
get-mt-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_mt.o)))))))
# Build a list of object files and binaries for each multithreaded
# implementation using the get-mt-objs() function defined above.
BLISSUP_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup)
BLISSUP_MT_BINS := $(patsubst %.o,%.x,$(BLISSUP_MT_OBJS))
BLISLPAB_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab)
BLISLPAB_MT_BINS := $(patsubst %.o,%.x,$(BLISLPAB_MT_OBJS))
EIGEN_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen)
EIGEN_MT_BINS := $(patsubst %.o,%.x,$(EIGEN_MT_OBJS))
OPENBLAS_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas)
OPENBLAS_MT_BINS := $(patsubst %.o,%.x,$(OPENBLAS_MT_OBJS))
VENDOR_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor)
VENDOR_MT_BINS := $(patsubst %.o,%.x,$(VENDOR_MT_OBJS))
#$(error "objs = $(EIGEN_ST_BINS)" )
# Mark the object files as intermediate so that make will remove them
# automatically after building the binaries on which they depend.
.INTERMEDIATE: $(BLISSUP_MT_OBJS) \
$(BLISLPAB_MT_OBJS) \
$(EIGEN_MT_OBJS) \
$(OPENBLAS_MT_OBJS) \
$(VENDOR_MT_OBJS)
#
# --- Targets/rules ------------------------------------------------------------
#
all: st
blis: blissup-st blislpab-st
blissup: blissup-st
blislpab: blislpab-st
eigen: eigen-st
openblas: openblas-st
blasfeo: blasfeo-st
libxsmm: libxsmm-st
vendor: vendor-st
st: blissup-st blislpab-st \
eigen-st openblas-st blasfeo-st libxsmm-st vendor-st
blissup-st: $(BLISSUP_ST_BINS)
blislpab-st: $(BLISLPAB_ST_BINS)
eigen-st: $(EIGEN_ST_BINS)
openblas-st: $(OPENBLAS_ST_BINS)
blasfeo-st: $(BLASFEO_ST_BINS)
libxsmm-st: $(LIBXSMM_ST_BINS)
vendor-st: $(VENDOR_ST_BINS)
mt: blissup-mt blislpab-mt \
eigen-mt openblas-mt vendor-mt
blissup-mt: $(BLISSUP_MT_BINS)
blislpab-mt: $(BLISLPAB_MT_BINS)
eigen-mt: $(EIGEN_MT_BINS)
openblas-mt: $(OPENBLAS_MT_BINS)
vendor-mt: $(VENDOR_MT_BINS)
# --Object file rules --
# Define the implementations for which we will instantiate compilation rules.
BIMPLS_ST := blissup blislpab openblas blasfeo libxsmm vendor
BIMPLS_MT := blissup blislpab openblas vendor
EIMPLS := eigen
# 1 2 3 4 567 8
# test_dgemm_nn_rrr_mpn6kp_blissup_st.x
# Define the function that will be used to instantiate compilation rules
# for the various single-threaded implementations.
define make-st-rule
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile
$(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@
endef
# Instantiate the rule function make-st-rule() for each BLIS/BLAS/CBLAS
# implementation.
$(foreach dt,$(DTS), \
$(foreach tr,$(TRANS), \
$(foreach st,$(STORS), \
$(foreach sh,$(SHAPES), \
$(foreach sm,$(SMS), \
$(foreach sn,$(SNS), \
$(foreach sk,$(SKS), \
$(foreach impl,$(BIMPLS_ST), \
$(eval $(call make-st-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
# Define the function that will be used to instantiate compilation rules
# for the various multithreaded implementations.
define make-mt-rule
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
$(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
endef
# Instantiate the rule function make-mt-rule() for each BLIS/BLAS/CBLAS
# implementation.
$(foreach dt,$(DTS), \
$(foreach tr,$(TRANS), \
$(foreach st,$(STORS), \
$(foreach sh,$(SHAPES), \
$(foreach sm,$(SMS), \
$(foreach sn,$(SNS), \
$(foreach sk,$(SKS), \
$(foreach impl,$(BIMPLS_MT), \
$(eval $(call make-mt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
# Define the function that will be used to instantiate compilation rules
# for the single-threaded Eigen implementation.
define make-eigst-rule
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile
$(CXX) $(CXXFLAGS_ST) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@
endef
# Instantiate the rule function make-eigst-rule() for each Eigen implementation.
$(foreach dt,$(DTS), \
$(foreach tr,$(TRANS), \
$(foreach st,$(STORS), \
$(foreach sh,$(SHAPES), \
$(foreach sm,$(SMS), \
$(foreach sn,$(SNS), \
$(foreach sk,$(SKS), \
$(foreach impl,$(EIMPLS), \
$(eval $(call make-eigst-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
# Define the function that will be used to instantiate compilation rules
# for the multithreaded Eigen implementation.
define make-eigmt-rule
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
$(CXX) $(CXXFLAGS_MT) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
endef
# Instantiate the rule function make-eigmt-rule() for each Eigen implementation.
$(foreach dt,$(DTS), \
$(foreach tr,$(TRANS), \
$(foreach st,$(STORS), \
$(foreach sh,$(SHAPES), \
$(foreach sm,$(SMS), \
$(foreach sn,$(SNS), \
$(foreach sk,$(SKS), \
$(foreach impl,$(EIMPLS), \
$(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
# -- Executable file rules --
# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
# on the link command line in case BLIS was configured with the BLAS
# compatibility layer. This prevents BLIS from inadvertently getting called
# for the BLAS routines we are trying to test with.
test_%_blissup_st.x: test_%_blissup_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_blislpab_st.x: test_%_blislpab_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_eigen_st.x: test_%_eigen_st.o $(LIBBLIS_LINK)
$(CXX) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_blasfeo_st.x: test_%_blasfeo_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(BLASFEO_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_libxsmm_st.x: test_%_libxsmm_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBXSMM_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_vendor_st.x: test_%_vendor_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_blissup_mt.x: test_%_blissup_mt.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_blislpab_mt.x: test_%_blislpab_mt.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_eigen_mt.x: test_%_eigen_mt.o $(LIBBLIS_LINK)
$(CXX) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_openblas_mt.x: test_%_openblas_mt.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_vendor_mt.x: test_%_vendor_mt.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
# -- Clean rules --
clean: cleanx
cleanx:
- $(RM_F) *.x *.o

View File

@@ -0,0 +1,52 @@
function [ r_val1, r_val2 ] = gen_opsupnames( ops, stor, smalldims )
% GEN_OPSUPNAMES  Expand operation names into per-shape sup test names.
%
%   ops        - char matrix with one operation name per row.
%   stor       - storage combination string inserted after the operation.
%   smalldims  - three-element vector holding the small m, n, and k values.
%
%   r_val1 - char matrix with seven rows per operation, each of the form
%            '<op>_<stor>_<shape>' left-justified in a 20-character field.
%   r_val2 - char matrix with the plain operation name repeated for each of
%            those rows.
%
%   In the shape suffix a dimension is written either with its small value
%   (e.g. 'm6') or with a 'p' placeholder (e.g. 'mp').

n_ops = size( ops, 1 );

small_m = smalldims( 1 );
small_n = smalldims( 2 );
small_k = smalldims( 3 );

% The seven shape suffixes, in the order in which they are emitted.
shapes = { sprintf( 'm%dnpkp',  small_m ), ...
           sprintf( 'mpn%dkp',  small_n ), ...
           sprintf( 'mpnpk%d',  small_k ), ...
           sprintf( 'mpn%dk%d', small_n, small_k ), ...
           sprintf( 'm%dnpk%d', small_m, small_k ), ...
           sprintf( 'm%dn%dkp', small_m, small_n ), ...
           'mpnpkp' };
n_shapes = numel( shapes );

row = 0;

for io = 1:n_ops

	op = ops( io, : );

	for is = 1:n_shapes

		row = row + 1;

		% Build the full name, then pad it to a fixed width so that every
		% row of the char matrix has the same length.
		fullname = sprintf( '%s_%s_%s', op, stor, shapes{ is } );

		opsupnames( row, : ) = sprintf( '%-20s', fullname );
		opnames( row, : )    = sprintf( '%s', op );
	end
end

r_val1 = opsupnames;
r_val2 = opnames;

View File

@@ -0,0 +1,250 @@
function r_val = plot_l3sup_perf( opname, ...
                                  data_blissup, ...
                                  data_blislpab, ...
                                  data_eigen, ...
                                  data_open, ...
                                  data_vend, vend_str, ...
                                  nth, ...
                                  rows, cols, ...
                                  cfreq, ...
                                  dfps, ...
                                  theid, impl )
% PLOT_L3SUP_PERF  Draw one subplot of a level-3 sup performance panel.
%
%   opname        - name shown as the subplot title. Its first character
%                   selects the flops-per-cycle scaling ('s' or 'c' doubles
%                   dfps).
%   data_blissup,
%   data_blislpab,
%   data_eigen,
%   data_open,
%   data_vend     - result matrices, one row per data point. The last column
%                   is plotted on the y-axis; the first of columns 1:3 whose
%                   first two rows differ (in data_blissup) is used as the
%                   x-axis.
%   vend_str      - legend label for the data_vend line.
%   nth           - divisor applied to every plotted value; also selects the
%                   y-axis label ('GFLOPS' when 1, 'GFLOPS/core' otherwise).
%   rows, cols    - dimensions of the subplot grid.
%   cfreq, dfps   - their product (after the 's'/'c' scaling) is the upper
%                   y-axis limit.
%   theid         - index of this subplot within the grid.
%   impl          - 'octave' selects the Octave-specific legend and title
%                   placement; any other value gets the MATLAB layout.
%
%   Always returns 0.

% Set to 0 to suppress the data lines. In that case only the subplot that
% hosts the legend gets (empty) placeholder lines, so that the legend can
% still be constructed.
show_plot = 1;

% Index of the subplot that hosts the legend.
legend_plot_id = 1*cols + 1*5;

ax1 = subplot( rows, cols, theid );
hold( ax1, 'on' );

% Set line properties.
color_blissup  = 'k'; lines_blissup  = '-';
color_blislpab = 'k'; lines_blislpab = ':';
color_eigen    = 'm'; lines_eigen    = '-.';
color_open     = 'r'; lines_open     = '--';
color_vend     = 'b'; lines_vend     = '-.';

% Compute the peak performance in terms of the number of double flops
% executable per cycle and the clock rate.
if opname(1) == 's' || opname(1) == 'c'
	flopspercycle = dfps * 2;
else
	flopspercycle = dfps;
end
max_perf_core = (flopspercycle * cfreq) * 1;

% Escape underscores in the title.
title_opname = strrep( opname, '_', '\_' );

% Print the title to a string.
titlename = sprintf( '%s', title_opname );

% Set the legend strings.
blissup_legend  = 'BLIS sup';
blislpab_legend = 'BLIS conv';
eigen_legend    = 'Eigen';
open_legend     = 'OpenBLAS';
vend_legend     = vend_str;

% Set axes range values. (x_end is set below.)
y_scale = 1.00;
x_begin = 0;
y_begin = 0;
y_end   = max_perf_core * y_scale;

% Set axes names.
if nth == 1
	yaxisname = 'GFLOPS';
else
	yaxisname = 'GFLOPS/core';
end

% The performance values live in the last column of the data matrices.
flopscol = size( data_blissup, 2 );

fontsize   = 11;
linesize   = 0.5;
legend_loc = 'southeast';

% --------------------------------------------------------------------

% Automatically detect a column with the increasing problem size.
% Then set the maximum x-axis value.
for psize_col = 1:3
	if data_blissup( 1, psize_col ) ~= data_blissup( 2, psize_col )
		break;
	end
end
x_axis( :, 1 ) = data_blissup( :, psize_col );

% Compute the number of data points we have in the x-axis. Note that
% we only use quarter the data points for the m = n = k column of graphs.
% The count is rounded down (and kept at least 1) so that it is always a
% valid index, even when the number of rows is not a multiple of four.
if mod(theid-1,cols) == 6
	np = max( 1, floor( size( data_blissup, 1 ) / 4 ) );
else
	np = size( data_blissup, 1 );
end

% Grab the last x-axis value.
x_end = data_blissup( np, psize_col );

if show_plot == 1
	blissup_ln  = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ...
	                    'Color',color_blissup, 'LineStyle',lines_blissup, ...
	                    'LineWidth',linesize );
	blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ...
	                    'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
	                    'LineWidth',linesize );
	eigen_ln    = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ...
	                    'Color',color_eigen, 'LineStyle',lines_eigen, ...
	                    'LineWidth',linesize );
	open_ln     = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
	                    'Color',color_open, 'LineStyle',lines_open, ...
	                    'LineWidth',linesize );
	vend_ln     = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
	                    'Color',color_vend, 'LineStyle',lines_vend, ...
	                    'LineWidth',linesize );
elseif theid == legend_plot_id
	% Placeholder lines: nothing is drawn, but the handles give the legend
	% something to refer to.
	blissup_ln  = line( nan, nan, ...
	                    'Color',color_blissup, 'LineStyle',lines_blissup, ...
	                    'LineWidth',linesize );
	blislpab_ln = line( nan, nan, ...
	                    'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
	                    'LineWidth',linesize );
	eigen_ln    = line( nan, nan, ...
	                    'Color',color_eigen, 'LineStyle',lines_eigen, ...
	                    'LineWidth',linesize );
	open_ln     = line( nan, nan, ...
	                    'Color',color_open, 'LineStyle',lines_open, ...
	                    'LineWidth',linesize );
	vend_ln     = line( nan, nan, ...
	                    'Color',color_vend, 'LineStyle',lines_vend, ...
	                    'LineWidth',linesize );
end

xlim( ax1, [x_begin x_end] );
ylim( ax1, [y_begin y_end] );

% Choose explicit x-axis ticks for certain ranges of x_end; for any other
% value the default ticks are left in place.
if     6000 <= x_end && x_end < 10000
	x_tick2 = x_end - 2000;
	x_tick1 = x_tick2/2;
	xticks( ax1, [ x_tick1 x_tick2 ] );
elseif 4000 <= x_end && x_end < 6000
	x_tick2 = x_end - 1000;
	x_tick1 = x_tick2/2;
	xticks( ax1, [ x_tick1 x_tick2 ] );
elseif 2000 <= x_end && x_end < 3000
	x_tick2 = x_end - 400;
	x_tick1 = x_tick2/2;
	xticks( ax1, [ x_tick1 x_tick2 ] );
elseif 500 <= x_end && x_end < 1000
	x_tick3 = x_end*(3/4);
	x_tick2 = x_end*(2/4);
	x_tick1 = x_end*(1/4);
	xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] );
end

% Only one subplot in the grid carries the legend.
if theid == legend_plot_id
	leg = legend( ...
	[ ...
	  blissup_ln ...
	  blislpab_ln ...
	  eigen_ln ...
	  open_ln ...
	  vend_ln ...
	], ...
	blissup_legend, ...
	blislpab_legend, ...
	eigen_legend, ...
	open_legend, ...
	vend_legend, ...
	'Location', legend_loc );
	set( leg,'Box','off' );
	set( leg,'Color','none' );
	set( leg,'Units','inches' );
	% Use strcmp() rather than ==, which compares element-wise and raises
	% an error when the two strings differ in length.
	if strcmp( impl, 'octave' )
		set( leg,'FontSize',fontsize );
		set( leg,'Position',[12.50 10.35 1.5 0.9 ] ); % (1,4tl)
	else
		set( leg,'FontSize',fontsize-1 );
		set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
	end
end

set( ax1,'FontSize',fontsize );
set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1.
box( ax1, 'on' );

titl = title( titlename );
set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'.

if strcmp( impl, 'octave' )
	tpos = get( titl, 'Position' ); % default is to align across whole figure, not box.
	tpos(1) = tpos(1) + -40;
	set( titl, 'Position', tpos ); % here we nudge it back to centered with box.
end

% Only the subplots in the bottom row of the grid get an x-axis label.
% NOTE(review): the small dimension values in these labels (6, 8, 10) are
% hard-coded rather than derived from the data -- confirm they match the
% values used to generate the results being plotted.
if theid > (rows-1)*cols
	if     theid == rows*cols - 6
		xlab = xlabel( ax1, 'm = 6; n = k' );
	elseif theid == rows*cols - 5
		xlab = xlabel( ax1, 'n = 8; m = k' );
	elseif theid == rows*cols - 4
		xlab = xlabel( ax1, 'k = 10; m = n' );
	elseif theid == rows*cols - 3
		xlab = xlabel( ax1, 'm; n = 8, k = 10' );
	elseif theid == rows*cols - 2
		xlab = xlabel( ax1, 'n; m = 6, k = 10' );
	elseif theid == rows*cols - 1
		xlab = xlabel( ax1, 'k; m = 6, n = 8' );
	elseif theid == rows*cols - 0
		xlab = xlabel( ax1, 'm = n = k' );
	end
end

% Only the subplots in the first column of the grid get a y-axis label.
if mod(theid-1,cols) == 0
	ylab = ylabel( ax1,yaxisname );
end

r_val = 0;

View File

@@ -0,0 +1,152 @@
function r_val = plot_panel_trxsh ...
     ( ...
       cfreq, ...
       dflopspercycle, ...
       nth, ...
       thr_str, ...
       dt_ch, ...
       stor_str, ...
       smalldims, ...
       dirpath, ...
       arch_str, ...
       vend_str, ...
       impl ...
     )
% PLOT_PANEL_TRXSH  Plot a 4 x 7 panel of gemm results and print it to a file.
%
%   For each of the four transpose combinations (nn, nt, tn, tt) and each
%   of the seven shape cases produced by gen_opsupnames(), this function
%   runs the corresponding result files found in dirpath, hands the data to
%   plot_l3sup_perf(), and finally prints the figure to
%   'l3sup_<dt>gemm_<stor>_<arch>_nt<nth>.pdf'.
%
%   cfreq, dflopspercycle - forwarded to plot_l3sup_perf().
%   nth        - forwarded to plot_l3sup_perf(); also used in the output
%                file name.
%   thr_str    - threading tag used in the result file and variable names
%                (e.g. 'mt').
%   dt_ch      - datatype character prefixed to 'gemm' (e.g. 'd').
%   stor_str   - storage combination string (e.g. 'rrr').
%   smalldims  - three-element vector of the small m, n, and k values.
%   dirpath    - directory containing the output_*.m result files.
%   arch_str   - architecture tag used in the output file name.
%   vend_str   - legend label for the vendor results.
%   impl       - 'matlab' or 'octave'; selects paper setup and print options.
%
%   Always returns 0.

% Create filename "templates" for the files that contain the performance
% results.
filetemp_blissup  = '%s/output_%s_%s_blissup.m';
filetemp_blislpab = '%s/output_%s_%s_blislpab.m';
filetemp_eigen    = '%s/output_%s_%s_eigen.m';
filetemp_open     = '%s/output_%s_%s_openblas.m';
filetemp_vend     = '%s/output_%s_%s_vendor.m';

% Create a variable name "template" for the variables contained in the
% files outlined above.
vartemp = 'data_%s_%s_%s( :, : )';

% Define the datatypes and operations we will be plotting.
oproot      = sprintf( '%cgemm', dt_ch );
ops( 1, : ) = sprintf( '%s_nn', oproot );
ops( 2, : ) = sprintf( '%s_nt', oproot );
ops( 3, : ) = sprintf( '%s_tn', oproot );
ops( 4, : ) = sprintf( '%s_tt', oproot );

% Generate datatype-specific operation names from the set of operations
% and datatypes.
[ opsupnames, opnames ] = gen_opsupnames( ops, stor_str, smalldims );
n_opsupnames = size( opsupnames, 1 );

% Set up the figure and its paper properties. Use strcmp() rather than ==
% to test impl, since == compares element-wise and raises an error when the
% two strings differ in length.
fig = figure('Position', [100, 100, 2400, 1200]);
orient( fig, 'portrait' );
set(gcf,'PaperUnits', 'inches');
if strcmp( impl, 'matlab' )
	set(gcf,'PaperSize', [11.5 20.4]);
	set(gcf,'PaperPosition', [0 0 11.5 20.4]);
	set(gcf,'PaperPositionMode','manual');
else % impl == 'octave' % octave 4.x
	set(gcf,'PaperSize', [12 21.5]);
	set(gcf,'PaperPositionMode','auto');
end
set(gcf,'PaperOrientation','landscape');

% Iterate over the list of datatype-specific operation names.
for opi = 1:n_opsupnames

	% Grab the current datatype combination.
	opsupname = opsupnames( opi, : );
	opname    = opnames( opi, : );

	% The names are stored as padded rows of a char matrix; strip the padding.
	opsupname = strtrim( opsupname );
	opname    = strtrim( opname );

	str = sprintf( 'Plotting %2d: %s', opi, opsupname ); disp(str);

	% Construct filenames for the data files from templates.
	file_blissup  = sprintf( filetemp_blissup,  dirpath, thr_str, opsupname );
	file_blislpab = sprintf( filetemp_blislpab, dirpath, thr_str, opsupname );
	file_eigen    = sprintf( filetemp_eigen,    dirpath, thr_str, opsupname );
	file_open     = sprintf( filetemp_open,     dirpath, thr_str, opsupname );
	file_vend     = sprintf( filetemp_vend,     dirpath, thr_str, opsupname );

	% Load the data files.
	run( file_blissup )
	run( file_blislpab )
	run( file_eigen )
	run( file_open )
	run( file_vend )

	% Construct variable names for the variables in the data files.
	var_blissup  = sprintf( vartemp, thr_str, opname, 'blissup' );
	var_blislpab = sprintf( vartemp, thr_str, opname, 'blislpab' );
	var_eigen    = sprintf( vartemp, thr_str, opname, 'eigen' );
	var_open     = sprintf( vartemp, thr_str, opname, 'openblas' );
	var_vend     = sprintf( vartemp, thr_str, opname, 'vendor' );

	% Use eval() to instantiate the variable names constructed above,
	% copying each to a simplified name.
	data_blissup  = eval( var_blissup );  % e.g. data_st_dgemm_blissup( :, : );
	data_blislpab = eval( var_blislpab ); % e.g. data_st_dgemm_blislpab( :, : );
	data_eigen    = eval( var_eigen );    % e.g. data_st_dgemm_eigen( :, : );
	data_open     = eval( var_open );     % e.g. data_st_dgemm_openblas( :, : );
	data_vend     = eval( var_vend );     % e.g. data_st_dgemm_vendor( :, : );

	% Plot one result in an m x n grid of plots, via the subplot()
	% function.
	plot_l3sup_perf( opsupname, ...
	                 data_blissup, ...
	                 data_blislpab, ...
	                 data_eigen, ...
	                 data_open, ...
	                 data_vend, vend_str, ...
	                 nth, ...
	                 4, 7, ...
	                 cfreq, ...
	                 dflopspercycle, ...
	                 opi, impl );

	% Discard the variables loaded from the data files so that results do
	% not carry over between iterations. The pattern is built from thr_str
	% so that it matches the loaded variable names for any threading tag,
	% not only 'mt'.
	clear( sprintf( 'data_%s_*gemm_*', thr_str ) );
	clear data_blissup;
	clear data_blislpab;
	clear data_eigen;
	clear data_open;
	clear data_vend;

end

% Construct the name of the file to which we will output the graph.
outfile = sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth );

% Output the graph to pdf format.
if strcmp( impl, 'octave' )
	print(gcf, outfile);
else % if impl == 'matlab'
	print(gcf, outfile,'-bestfit','-dpdf');
end

% Assign the declared output so that callers requesting a return value do
% not trigger an error.
r_val = 0;

View File

@@ -0,0 +1,12 @@
% Example invocations of plot_panel_trxsh(), grouped by architecture.
% The positional arguments are, in order:
%   cfreq, dflopspercycle, nth, thr_str, dt_ch, stor_str, smalldims,
%   dirpath, arch_str, vend_str, impl
% Each call is followed by 'close; clear all;' so that the next call starts
% with no open figure and an empty workspace.
% haswell
plot_panel_trxsh(3.25,16,1,'mt','d','ccc',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all;
plot_panel_trxsh(3.25,16,1,'mt','d','rrr',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all;
% kabylake
plot_panel_trxsh(3.80,16,1,'mt','d','rrr',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all;
plot_panel_trxsh(3.80,16,1,'mt','d','ccc',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all;
% epyc
plot_panel_trxsh(3.00, 8,1,'mt','d','rrr',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
plot_panel_trxsh(3.00, 8,1,'mt','d','ccc',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;

188
test/supmt/runme.sh Executable file
View File

@@ -0,0 +1,188 @@
#!/bin/bash

# Runs every test executable named by the combinations configured below and
# redirects the standard output of each one to its own output file in the
# current directory.

# File prefixes.
exec_root="test"
out_root="output"

# The system we are running on; selects the thread count and CPU affinity.
sys="blis"
#sys="lonestar5"
#sys="ul252"
#sys="ul264"

if [ "${sys}" = "blis" ]; then

	export GOMP_CPU_AFFINITY="0-3"
	nt=4

elif [ "${sys}" = "lonestar5" ]; then

	export GOMP_CPU_AFFINITY="0-23"
	nt=24

elif [ "${sys}" = "ul252" ]; then

	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
	export GOMP_CPU_AFFINITY="0-51"
	nt=52

elif [ "${sys}" = "ul264" ]; then

	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
	export GOMP_CPU_AFFINITY="0-63"
	nt=64

else

	# Without a recognized system, nt would be left empty and every
	# *_NUM_THREADS variable below would be exported with an empty value.
	echo "runme.sh: unrecognized sys '${sys}'; cannot choose a thread count." >&2
	exit 1

fi

# Delay between test cases.
delay=0.02

# Threadedness to test.
threads="mt"

# Datatypes to test.
#dts="d s"
dts="d"

# Operations to test.
ops="gemm"

# Transpose combinations to test.
trans="nn nt tn tt"

# Storage combinations to test.
#stors="rrr rrc rcr rcc crr crc ccr ccc"
stors="rrr ccc"

# Problem shapes to test.
shapes="sll lsl lls lss sls ssl lll"

# FGVZ: figure out how to probe what's in the directory and
# execute everything that's there?
sms="6"
sns="8"
sks="10"

# Implementations to test.
impls="vendor blissup blislpab openblas eigen"
#impls="vendor"
#impls="blissup"
#impls="blislpab"
#impls="openblas"
#impls="eigen"

# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
# restore the value.
GOMP_CPU_AFFINITYsave="${GOMP_CPU_AFFINITY}"

# Example: test_dgemm_nn_rrr_m6npkp_blissup_mt.x

for th in ${threads}; do

	for dt in ${dts}; do

		for op in ${ops}; do

			for tr in ${trans}; do

				for st in ${stors}; do

					for sh in ${shapes}; do

						for sm in ${sms}; do

							for sn in ${sns}; do

								for sk in ${sks}; do

									for im in ${impls}; do

										# Request nt threads through the environment
										# variable that corresponds to the current
										# implementation.
										if [ "${im:0:4}" = "blis" ]; then
											unset  OMP_NUM_THREADS
											export BLIS_NUM_THREADS="${nt}"
										elif [ "${im}" = "openblas" ]; then
											unset  OMP_NUM_THREADS
											export OPENBLAS_NUM_THREADS="${nt}"
										elif [ "${im}" = "eigen" ]; then
											export OMP_NUM_THREADS="${nt}"
										elif [ "${im}" = "vendor" ]; then
											unset  OMP_NUM_THREADS
											export MKL_NUM_THREADS="${nt}"
										fi

										# Multithreaded OpenBLAS seems to have a problem
										# running properly if GOMP_CPU_AFFINITY is set.
										# So we temporarily unset it here if we are about
										# to execute OpenBLAS, but otherwise restore it.
										if [ "${im}" = "openblas" ]; then
											unset  GOMP_CPU_AFFINITY
										else
											export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
										fi

										# Limit execution of non-BLIS implementations to
										# rrr/ccc storage cases.
										if [ "${im:0:4}" != "blis" ] && \
										   [ "${st}" != "rrr" ] && \
										   [ "${st}" != "ccc" ]; then
											continue;
										fi

										# Further limit execution of libxsmm to
										# ccc storage cases.
										if [ "${im:0:7}" = "libxsmm" ] && \
										   [ "${st}" != "ccc" ]; then
											continue;
										fi

										# Extract the shape chars for m, n, k.
										chm="${sh:0:1}"
										chn="${sh:1:1}"
										chk="${sh:2:1}"

										# Construct the shape substring (e.g. m6npkp)
										shstr=""

										if [ "${chm}" = "s" ]; then
											shstr="${shstr}m${sm}"
										else
											shstr="${shstr}mp"
										fi

										if [ "${chn}" = "s" ]; then
											shstr="${shstr}n${sn}"
										else
											shstr="${shstr}np"
										fi

										if [ "${chk}" = "s" ]; then
											shstr="${shstr}k${sk}"
										else
											shstr="${shstr}kp"
										fi

										# Ex: test_dgemm_nn_rrr_m6npkp_blissup_mt.x

										# Construct the name of the test executable.
										exec_name="${exec_root}_${dt}${op}_${tr}_${st}_${shstr}_${im}_${th}.x"

										# Construct the name of the output file.
										out_file="${out_root}_${th}_${dt}${op}_${tr}_${st}_${shstr}_${im}.m"

										echo "Running (nt = ${nt}) ./${exec_name} > ${out_file}"

										# Run executable.
										"./${exec_name}" > "${out_file}"

										sleep "${delay}"

									done
								done
							done
						done
					done
				done
			done
		done
	done
done

589
test/supmt/test_gemm.c Normal file
View File

@@ -0,0 +1,589 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#ifdef EIGEN
#define BLIS_DISABLE_BLAS_DEFS
#include "blis.h"
#include <Eigen/Core>
//#include <Eigen/src/misc/blas.h>
using namespace Eigen;
#else
#include "blis.h"
#endif
//#define PRINT
// Benchmark driver for gemm. For each problem size p, stepping from p_max
// down to p_begin in decrements of p_inc, it times n_trials executions of
// the implementation selected at compile time (EIGEN, BLIS, BLAS, or CBLAS)
// and prints one line of matlab-style output containing m, n, k, and the
// GFLOPS computed from the shortest recorded time. All test parameters are
// supplied through preprocessor macros (N_TRIALS, DT, P_BEGIN, P_MAX, P_INC,
// M_DIM, N_DIM, K_DIM, TRANSA, TRANSB, STOR3, THR_STR, STR). The command
// line arguments are not used. Always returns 0.
int main( int argc, char** argv )
{
rntm_t rntm_g;
bli_init();
// Copy the global rntm_t object in case we need it later when disabling
// sup.
bli_rntm_init_from_global( &rntm_g );
// Unless ERROR_CHECK is defined, turn off BLIS error checking.
#ifndef ERROR_CHECK
bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
#endif
dim_t n_trials = N_TRIALS;
num_t dt = DT;
// In each "#if 1" pair below, the first branch takes the value from the
// build-supplied macro; the #else branch holds hard-coded values that are
// currently compiled out.
#if 1
dim_t p_begin = P_BEGIN;
dim_t p_max = P_MAX;
dim_t p_inc = P_INC;
#else
dim_t p_begin = 4;
dim_t p_max = 40;
dim_t p_inc = 4;
#endif
#if 1
dim_t m_input = M_DIM;
dim_t n_input = N_DIM;
dim_t k_input = K_DIM;
#else
p_begin = p_inc = 32;
dim_t m_input = 6;
dim_t n_input = -1;
dim_t k_input = -1;
#endif
#if 1
trans_t transa = TRANSA;
trans_t transb = TRANSB;
#else
trans_t transa = BLIS_NO_TRANSPOSE;
trans_t transb = BLIS_NO_TRANSPOSE;
#endif
#if 1
stor3_t sc = STOR3;
#else
stor3_t sc = BLIS_RRR;
#endif
inc_t rs_c, cs_c;
inc_t rs_a, cs_a;
inc_t rs_b, cs_b;
// Choose the strides passed to bli_obj_create() for C, A, and B (in that
// order of the letters in the storage combination): both strides are set
// to -1 for an operand marked 'R' and to 0 for an operand marked 'C'.
// NOTE(review): presumably bli_obj_create() interprets (-1,-1) as a request
// for row storage and (0,0) as column storage -- confirm against its
// documentation.
if ( sc == BLIS_RRR ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b = -1; }
else if ( sc == BLIS_RRC ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b = 0; }
else if ( sc == BLIS_RCR ) { rs_c = cs_c = -1; rs_a = cs_a = 0; rs_b = cs_b = -1; }
else if ( sc == BLIS_RCC ) { rs_c = cs_c = -1; rs_a = cs_a = 0; rs_b = cs_b = 0; }
else if ( sc == BLIS_CRR ) { rs_c = cs_c = 0; rs_a = cs_a = -1; rs_b = cs_b = -1; }
else if ( sc == BLIS_CRC ) { rs_c = cs_c = 0; rs_a = cs_a = -1; rs_b = cs_b = 0; }
else if ( sc == BLIS_CCR ) { rs_c = cs_c = 0; rs_a = cs_a = 0; rs_b = cs_b = -1; }
else if ( sc == BLIS_CCC ) { rs_c = cs_c = 0; rs_a = cs_a = 0; rs_b = cs_b = 0; }
else { bli_abort(); }
// Storage argument for the CBLAS calls. Only the all-row and all-column
// combinations have a CBLAS equivalent; anything else is marked with -1.
f77_int cbla_storage;
if ( sc == BLIS_RRR ) cbla_storage = CblasRowMajor;
else if ( sc == BLIS_CCC ) cbla_storage = CblasColMajor;
else cbla_storage = -1;
// The variable is only referenced in the CBLAS branch; the cast keeps it
// "used" in the other configurations.
( void )cbla_storage;
char dt_ch;
// Choose the char corresponding to the requested datatype.
if ( bli_is_float( dt ) ) dt_ch = 's';
else if ( bli_is_double( dt ) ) dt_ch = 'd';
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
else dt_ch = 'z';
f77_char f77_transa;
f77_char f77_transb;
char transal, transbl;
// Map the transposition parameters to their BLAS characters, and keep
// lowercase copies for use in the output variable name.
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
transal = tolower( f77_transa );
transbl = tolower( f77_transb );
// Any transposition character other than 'n' is mapped to CblasTrans.
f77_int cbla_transa = ( transal == 'n' ? CblasNoTrans : CblasTrans );
f77_int cbla_transb = ( transbl == 'n' ? CblasNoTrans : CblasTrans );
( void )cbla_transa;
( void )cbla_transb;
dim_t p;
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
// (The empty loop advances p to the last value of the form
// p_begin + i*p_inc that does not exceed p_max.)
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch,
transal, transbl, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )0,
( unsigned long )0,
( unsigned long )0, 0.0 );
// Iterate over the problem sizes from largest to smallest.
//for ( p = p_begin; p <= p_max; p += p_inc )
for ( p = p_max; p_begin <= p; p -= p_inc )
{
obj_t a, b, c;
obj_t c_save;
obj_t alpha, beta;
dim_t m, n, k;
// A negative input dimension means "scale with p": the dimension is set
// to p divided by the magnitude of the input. A non-negative input is used
// as a fixed dimension.
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
else m = ( dim_t ) m_input;
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
else n = ( dim_t ) n_input;
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
else k = ( dim_t ) k_input;
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
bli_obj_create( dt, 1, 1, 0, 0, &beta );
bli_obj_create( dt, m, n, rs_c, cs_c, &c );
bli_obj_create( dt, m, n, rs_c, cs_c, &c_save );
// A and B are created with their dimensions swapped when they are to be
// transposed, so that the product is always m x n with inner dimension k.
if ( bli_does_notrans( transa ) )
bli_obj_create( dt, m, k, rs_a, cs_a, &a );
else
bli_obj_create( dt, k, m, rs_a, cs_a, &a );
if ( bli_does_notrans( transb ) )
bli_obj_create( dt, k, n, rs_b, cs_b, &b );
else
bli_obj_create( dt, n, k, rs_b, cs_b, &b );
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
bli_obj_set_conjtrans( transa, &a );
bli_obj_set_conjtrans( transb, &b );
// Both alpha and beta are set to 1.0 + 0.0i.
bli_setsc( (1.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
// Keep a copy of the initial C so that it can be restored before each
// trial.
bli_copym( &c, &c_save );
#ifdef EIGEN
double alpha_r, alpha_i;
bli_getsc( &alpha, &alpha_r, &alpha_i );
void* ap = bli_obj_buffer_at_off( &a );
void* bp = bli_obj_buffer_at_off( &b );
void* cp = bli_obj_buffer_at_off( &c );
// The outer stride handed to Eigen is the column stride for a
// column-stored matrix and the row stride otherwise.
const int os_a = ( bli_obj_is_col_stored( &a ) ? bli_obj_col_stride( &a )
: bli_obj_row_stride( &a ) );
const int os_b = ( bli_obj_is_col_stored( &b ) ? bli_obj_col_stride( &b )
: bli_obj_row_stride( &b ) );
const int os_c = ( bli_obj_is_col_stored( &c ) ? bli_obj_col_stride( &c )
: bli_obj_row_stride( &c ) );
Stride<Dynamic,1> stride_a( os_a, 1 );
Stride<Dynamic,1> stride_b( os_b, 1 );
Stride<Dynamic,1> stride_c( os_c, 1 );
// NOTE(review): the IS_FLOAT branch below is empty, so the Eigen maps A, B,
// and C used in the timing loop are only declared when IS_DOUBLE is
// defined.
#if defined(IS_FLOAT)
#elif defined (IS_DOUBLE)
#ifdef A_STOR_R
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_A;
#else
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_A;
#endif
#ifdef B_STOR_R
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_B;
#else
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_B;
#endif
#ifdef C_STOR_R
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_C;
#else
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_C;
#endif
#ifdef A_NOTRANS // A is not transposed
Map<MatrixXd_A, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
#else // A is transposed
Map<MatrixXd_A, 0, Stride<Dynamic,1> > A( ( double* )ap, k, m, stride_a );
#endif
#ifdef B_NOTRANS // B is not transposed
Map<MatrixXd_B, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
#else // B is transposed
Map<MatrixXd_B, 0, Stride<Dynamic,1> > B( ( double* )bp, n, k, stride_b );
#endif
Map<MatrixXd_C, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
#endif
#endif
// dtime_save tracks the best time observed across the trials; it starts
// at the largest representable double.
double dtime_save = DBL_MAX;
for ( dim_t r = 0; r < n_trials; ++r )
{
// Restore C before starting the clock so that every trial operates on
// the same input.
bli_copym( &c_save, &c );
double dtime = bli_clock();
#ifdef EIGEN
#ifdef A_NOTRANS
#ifdef B_NOTRANS
C.noalias() += alpha_r * A * B;
#else // B_TRANS
C.noalias() += alpha_r * A * B.transpose();
#endif
#else // A_TRANS
#ifdef B_NOTRANS
C.noalias() += alpha_r * A.transpose() * B;
#else // B_TRANS
C.noalias() += alpha_r * A.transpose() * B.transpose();
#endif
#endif
#endif
#ifdef BLIS
#ifdef SUP
// Allow sup.
bli_gemm( &alpha,
&a,
&b,
&beta,
&c );
#else
// Disable sup and use the expert interface.
//rntm_t rntm = BLIS_RNTM_INITIALIZER;
rntm_t rntm = rntm_g;
bli_rntm_disable_l3_sup( &rntm );
bli_gemm_ex( &alpha,
&a,
&b,
&beta,
&c, NULL, &rntm );
#endif
#endif
#ifdef BLAS
// Fortran-style BLAS interface: the leading dimensions are taken from the
// column strides of the objects, and the routine called depends on dt
// (and on whether XSMM is defined).
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = ( float* )bli_obj_buffer( &alpha );
float* ap = ( float* )bli_obj_buffer( &a );
float* bp = ( float* )bli_obj_buffer( &b );
float* betap = ( float* )bli_obj_buffer( &beta );
float* cp = ( float* )bli_obj_buffer( &c );
#ifdef XSMM
libxsmm_sgemm( &f77_transa,
#else
sgemm_( &f77_transa,
#endif
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = ( double* )bli_obj_buffer( &alpha );
double* ap = ( double* )bli_obj_buffer( &a );
double* bp = ( double* )bli_obj_buffer( &b );
double* betap = ( double* )bli_obj_buffer( &beta );
double* cp = ( double* )bli_obj_buffer( &c );
#ifdef XSMM
libxsmm_dgemm( &f77_transa,
#else
dgemm_( &f77_transa,
#endif
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
#ifdef XSMM
libxsmm_cgemm( &f77_transa,
#else
cgemm_( &f77_transa,
#endif
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
#ifdef XSMM
libxsmm_zgemm( &f77_transa,
#else
zgemm_( &f77_transa,
#endif
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
#endif
#ifdef CBLAS
// CBLAS interface: the leading dimensions are the row strides when
// C_STOR_R is defined and the column strides otherwise. Real scalars are
// passed by value; complex scalars are passed by pointer.
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
#ifdef C_STOR_R
f77_int lda = bli_obj_row_stride( &a );
f77_int ldb = bli_obj_row_stride( &b );
f77_int ldc = bli_obj_row_stride( &c );
#else
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
#endif
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
cblas_sgemm( cbla_storage,
cbla_transa,
cbla_transb,
mm,
nn,
kk,
*alphap,
ap, lda,
bp, ldb,
*betap,
cp, ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
#ifdef C_STOR_R
f77_int lda = bli_obj_row_stride( &a );
f77_int ldb = bli_obj_row_stride( &b );
f77_int ldc = bli_obj_row_stride( &c );
#else
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
#endif
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
cblas_dgemm( cbla_storage,
cbla_transa,
cbla_transb,
mm,
nn,
kk,
*alphap,
ap, lda,
bp, ldb,
*betap,
cp, ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
#ifdef C_STOR_R
f77_int lda = bli_obj_row_stride( &a );
f77_int ldb = bli_obj_row_stride( &b );
f77_int ldc = bli_obj_row_stride( &c );
#else
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
#endif
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* bp = bli_obj_buffer( &b );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
cblas_cgemm( cbla_storage,
cbla_transa,
cbla_transb,
mm,
nn,
kk,
alphap,
ap, lda,
bp, ldb,
betap,
cp, ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
#ifdef C_STOR_R
f77_int lda = bli_obj_row_stride( &a );
f77_int ldb = bli_obj_row_stride( &b );
f77_int ldc = bli_obj_row_stride( &c );
#else
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
#endif
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* bp = bli_obj_buffer( &b );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
cblas_zgemm( cbla_storage,
cbla_transa,
cbla_transb,
mm,
nn,
kk,
alphap,
ap, lda,
bp, ldb,
betap,
cp, ldc );
}
#endif
// NOTE(review): presumably this returns the smaller of dtime_save and the
// time elapsed since dtime was recorded -- confirm against the
// bli_clock_min_diff() documentation.
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
// A gemm performs 2*m*k*n flops; the count is multiplied by four for
// complex datatypes.
double gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
// Emit the result as an assignment to row (p - p_begin)/p_inc + 1 of the
// matlab array named after this test.
printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch,
transal, transbl, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )m,
( unsigned long )n,
( unsigned long )k, gflops );
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &b );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
return 0;
}

View File

@@ -2109,6 +2109,10 @@ void libblis_test_op_driver
// Loop over the requested storage schemes.
for ( sci = 0; sci < n_store_combos; ++sci )
//for ( sci = 0; sci < 5; ( sci == 2 ? sci+=2 : ++sci ) )
//for ( sci = 3; sci < 8; ( sci == 3 ? sci+=2 : ++sci ) )
//for ( sci = 0; sci < 1; ++sci )
//for ( sci = 7; sci < 8; ++sci )
{
// Loop over the requested datatypes.
for ( dci = 0; dci < n_dt_combos; ++dci )