Merge master code 2016_10_13 Removed previously renamed/old files

Change-Id: I8106d371afaa0af474a8967388d44481b05de923
This commit is contained in:
praveeng
2016-10-13 12:02:28 +05:30
38 changed files with 1956 additions and 1014 deletions

1064
CHANGELOG

File diff suppressed because it is too large Load Diff

View File

@@ -34,12 +34,11 @@
#include "blis.h"
#if 0
thrinfo_t* bli_packm_thrinfo_create
(
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
@@ -51,7 +50,6 @@ thrinfo_t* bli_packm_thrinfo_create
(
thread,
ocomm, ocomm_id,
icomm, icomm_id,
n_way,
work_id,
FALSE,
@@ -60,14 +58,13 @@ thrinfo_t* bli_packm_thrinfo_create
return thread;
}
#endif
void bli_packm_thrinfo_init
(
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
@@ -77,7 +74,6 @@ void bli_packm_thrinfo_init
(
thread,
ocomm, ocomm_id,
icomm, icomm_id,
n_way, work_id,
FALSE,
sub_node
@@ -93,13 +89,13 @@ void bli_packm_thrinfo_init_single
(
thread,
&BLIS_SINGLE_COMM, 0,
&BLIS_SINGLE_COMM, 0,
1,
0,
NULL
);
}
#if 0
void bli_packm_thrinfo_free
(
thrinfo_t* thread
@@ -109,4 +105,4 @@ void bli_packm_thrinfo_free
thread != &BLIS_PACKM_SINGLE_THREADED )
bli_free_intl( thread );
}
#endif

View File

@@ -42,24 +42,22 @@
// thrinfo_t APIs specific to packm.
//
#if 0
thrinfo_t* bli_packm_thrinfo_create
(
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
);
#endif
void bli_packm_thrinfo_init
(
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
@@ -70,8 +68,10 @@ void bli_packm_thrinfo_init_single
thrinfo_t* thread
);
#if 0
void bli_packm_thrinfo_free
(
thrinfo_t* thread
);
#endif

View File

@@ -35,12 +35,11 @@
#include "blis.h"
#include "assert.h"
#if 0
thrinfo_t* bli_l3_thrinfo_create
(
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
@@ -49,21 +48,19 @@ thrinfo_t* bli_l3_thrinfo_create
return bli_thrinfo_create
(
ocomm, ocomm_id,
icomm, icomm_id,
n_way,
work_id,
TRUE,
sub_node
);
}
#endif
void bli_l3_thrinfo_init
(
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
@@ -73,7 +70,6 @@ void bli_l3_thrinfo_init
(
thread,
ocomm, ocomm_id,
icomm, icomm_id,
n_way,
work_id,
TRUE,
@@ -105,14 +101,12 @@ void bli_l3_thrinfo_free
// is marked as needing them to be freed. The most common example of
// thrinfo_t nodes NOT marked as needing their comms freed are those
// associated with packm thrinfo_t nodes.
if ( bli_thrinfo_needs_free_comms( thread ) )
if ( bli_thrinfo_needs_free_comm( thread ) )
{
// The ochief always frees its communicator, and the ichief frees its
// communicator if we are at the leaf node.
if ( bli_thread_am_ochief( thread ) )
bli_thrcomm_free( bli_thrinfo_ocomm( thread ) );
if ( thrinfo_sub_node == NULL && bli_thread_am_ichief( thread ) )
bli_thrcomm_free( bli_thrinfo_icomm( thread ) );
}
// Free all children of the current thrinfo_t.
@@ -124,117 +118,208 @@ void bli_l3_thrinfo_free
// -----------------------------------------------------------------------------
//#define PRINT_THRINFO
thrinfo_t** bli_l3_thrinfo_create_paths
void bli_l3_thrinfo_create_root
(
opid_t l3_op,
side_t side
dim_t id,
thrcomm_t* gl_comm,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t** thread
)
{
dim_t jc_in, jc_way;
dim_t kc_in, kc_way;
dim_t ic_in, ic_way;
dim_t jr_in, jr_way;
dim_t ir_in, ir_way;
// Query the global communicator for the total number of threads to use.
dim_t n_threads = bli_thrcomm_num_threads( gl_comm );
#ifdef BLIS_ENABLE_MULTITHREADING
jc_in = bli_env_read_nway( "BLIS_JC_NT" );
//kc_way = bli_env_read_nway( "BLIS_KC_NT" );
kc_in = 1;
ic_in = bli_env_read_nway( "BLIS_IC_NT" );
jr_in = bli_env_read_nway( "BLIS_JR_NT" );
ir_in = bli_env_read_nway( "BLIS_IR_NT" );
#else
jc_in = 1;
kc_in = 1;
ic_in = 1;
jr_in = 1;
ir_in = 1;
#endif
// Use the thread id passed in as the global communicator id.
dim_t gl_comm_id = id;
if ( l3_op == BLIS_TRMM )
{
// We reconfigure the parallelism for trmm_r due to a dependency in
// the jc loop. (NOTE: This dependency does not exist for trmm3.)
if ( bli_is_right( side ) )
{
jc_way = 1;
kc_way = kc_in;
ic_way = ic_in;
jr_way = jr_in * jc_in;
ir_way = ir_in;
}
else // if ( bli_is_left( side ) )
{
jc_way = jc_in;
kc_way = kc_in;
ic_way = ic_in;
jr_way = jr_in;
ir_way = ir_in;
}
}
else if ( l3_op == BLIS_TRSM )
{
if ( bli_is_right( side ) )
{
// Use the blocksize id of the current (root) control tree node to
// query the top-most ways of parallelism to obtain.
bszid_t bszid = bli_cntl_bszid( cntl );
dim_t xx_way = bli_cntx_way_for_bszid( bszid, cntx );
jc_way = 1;
kc_way = 1;
ic_way = jc_in * ic_in * jr_in;
jr_way = 1;
ir_way = 1;
}
else // if ( bli_is_left( side ) )
{
jc_way = 1;
kc_way = 1;
ic_way = 1;
jr_way = ic_in * jr_in * ir_in;
ir_way = 1;
}
}
else // all other level-3 operations
// Determine the work id for this thrinfo_t node.
dim_t work_id = gl_comm_id / ( n_threads / xx_way );
// Create the root thrinfo_t node.
*thread = bli_thrinfo_create
(
gl_comm,
gl_comm_id,
xx_way,
work_id,
TRUE,
NULL
);
}
// -----------------------------------------------------------------------------
void bli_l3_thrinfo_print_paths
(
thrinfo_t** threads
)
{
dim_t n_threads = bli_thread_num_threads( threads[0] );
dim_t gl_comm_id;
thrinfo_t* jc_info = threads[0];
thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info );
thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info );
thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info );
thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info );
thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info );
thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info );
dim_t jc_way = bli_thread_n_way( jc_info );
dim_t pc_way = bli_thread_n_way( pc_info );
dim_t pb_way = bli_thread_n_way( pb_info );
dim_t ic_way = bli_thread_n_way( ic_info );
dim_t pa_way = bli_thread_n_way( pa_info );
dim_t jr_way = bli_thread_n_way( jr_info );
dim_t ir_way = bli_thread_n_way( ir_info );
dim_t gl_nt = bli_thread_num_threads( jc_info );
dim_t jc_nt = bli_thread_num_threads( pc_info );
dim_t pc_nt = bli_thread_num_threads( pb_info );
dim_t pb_nt = bli_thread_num_threads( ic_info );
dim_t ic_nt = bli_thread_num_threads( pa_info );
dim_t pa_nt = bli_thread_num_threads( jr_info );
dim_t jr_nt = bli_thread_num_threads( ir_info );
printf( " gl jc kc pb ic pa jr ir\n" );
printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
gl_nt, jc_nt, pc_nt, pb_nt, ic_nt, pa_nt, jr_nt, (dim_t)1 );
printf( "\n" );
printf( " jc kc pb ic pa jr ir\n" );
printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
jc_way, pc_way, pb_way, ic_way, pa_way, jr_way, ir_way );
printf( "=================================================\n" );
for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id )
{
jc_way = jc_in;
kc_way = kc_in;
ic_way = ic_in;
jr_way = jr_in;
ir_way = ir_in;
jc_info = threads[gl_comm_id];
pc_info = bli_thrinfo_sub_node( jc_info );
pb_info = bli_thrinfo_sub_node( pc_info );
ic_info = bli_thrinfo_sub_node( pb_info );
pa_info = bli_thrinfo_sub_node( ic_info );
jr_info = bli_thrinfo_sub_node( pa_info );
ir_info = bli_thrinfo_sub_node( jr_info );
dim_t gl_comm_id = bli_thread_ocomm_id( jc_info );
dim_t jc_comm_id = bli_thread_ocomm_id( pc_info );
dim_t pc_comm_id = bli_thread_ocomm_id( pb_info );
dim_t pb_comm_id = bli_thread_ocomm_id( ic_info );
dim_t ic_comm_id = bli_thread_ocomm_id( pa_info );
dim_t pa_comm_id = bli_thread_ocomm_id( jr_info );
dim_t jr_comm_id = bli_thread_ocomm_id( ir_info );
dim_t jc_work_id = bli_thread_work_id( jc_info );
dim_t pc_work_id = bli_thread_work_id( pc_info );
dim_t pb_work_id = bli_thread_work_id( pb_info );
dim_t ic_work_id = bli_thread_work_id( ic_info );
dim_t pa_work_id = bli_thread_work_id( pa_info );
dim_t jr_work_id = bli_thread_work_id( jr_info );
dim_t ir_work_id = bli_thread_work_id( ir_info );
printf( " gl jc pb kc pa ic jr \n" );
printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
gl_comm_id, jc_comm_id, pc_comm_id, pb_comm_id, ic_comm_id, pa_comm_id, jr_comm_id );
printf( "work ids: %4ld %4ld %4lu %4lu %4ld %4ld %4ld\n",
jc_work_id, pc_work_id, pb_work_id, ic_work_id, pa_work_id, jr_work_id, ir_work_id );
printf( "---------------------------------------\n" );
}
}
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
assert( global_num_threads != 0 );
// -----------------------------------------------------------------------------
dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
dim_t kc_nt = ic_way * jr_way * ir_way;
#if 0
// Build an array holding one root thrinfo_t node per thread. All nodes share
// a single, newly created global communicator sized by the thread count that
// the context reports. The array and the communicator are allocated here and
// returned to the caller.
// NOTE(review): this definition appears to sit inside an #if 0 region, so it
// is presumably not compiled at present -- confirm before relying on it.
thrinfo_t** bli_l3_thrinfo_create_roots
(
cntx_t* cntx,
cntl_t* cntl
)
{
// Query the context for the total number of threads to use.
dim_t n_threads = bli_cntx_get_num_threads( cntx );
// Create a global thread communicator for all the threads.
thrcomm_t* gl_comm = bli_thrcomm_create( n_threads );
// Allocate an array of thrinfo_t pointers, one for each thread.
thrinfo_t** paths = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) );
// Use the blocksize id of the current (root) control tree node to
// query the top-most ways of parallelism to obtain.
bszid_t bszid = bli_cntl_bszid( cntl );
dim_t xx_way = bli_cntx_way_for_bszid( bszid, cntx );
dim_t gl_comm_id;
// Create one thrinfo_t node for each thread in the (global) communicator.
for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id )
{
// Each group of ( n_threads / xx_way ) consecutive thread ids shares one
// work id.
// NOTE(review): assumes 0 < xx_way <= n_threads; otherwise the inner
// quotient is zero (or the divisor is zero) and the division is
// undefined -- confirm the caller guarantees this.
dim_t work_id = gl_comm_id / ( n_threads / xx_way );
paths[ gl_comm_id ] = bli_thrinfo_create
(
gl_comm,
gl_comm_id,
xx_way,
work_id,
TRUE,
NULL
);
}
return paths;
}
//#define PRINT_THRINFO
thrinfo_t** bli_l3_thrinfo_create_full_paths
(
cntx_t* cntx
)
{
dim_t jc_way = bli_cntx_jc_way( cntx );
dim_t pc_way = bli_cntx_pc_way( cntx );
dim_t ic_way = bli_cntx_ic_way( cntx );
dim_t jr_way = bli_cntx_jr_way( cntx );
dim_t ir_way = bli_cntx_ir_way( cntx );
dim_t gl_nt = jc_way * pc_way * ic_way * jr_way * ir_way;
dim_t jc_nt = pc_way * ic_way * jr_way * ir_way;
dim_t pc_nt = ic_way * jr_way * ir_way;
dim_t ic_nt = jr_way * ir_way;
dim_t jr_nt = ir_way;
dim_t ir_nt = 1;
assert( gl_nt != 0 );
#ifdef PRINT_THRINFO
printf( " jc kc ic jr ir\n" );
printf( "xx_way: %4lu %4lu %4lu %4lu %4lu\n",
jc_way, kc_way, ic_way, jr_way, ir_way );
printf( " gl jc kc pb ic pa jr ir\n" );
printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
gl_nt, jc_nt, pc_nt, pc_nt, ic_nt, ic_nt, jr_nt, ir_nt );
printf( "\n" );
printf( " gl jc kc ic jr ir\n" );
printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu\n",
global_num_threads, jc_nt, kc_nt, ic_nt, jr_nt, ir_nt );
printf( "=======================================\n" );
printf( " jc kc pb ic pa jr ir\n" );
printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
jc_way, pc_way, (dim_t)0, ic_way, (dim_t)0, jr_way, ir_way );
printf( "=================================================\n" );
#endif
thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) );
thrinfo_t** paths = bli_malloc_intl( gl_nt * sizeof( thrinfo_t* ) );
thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads );
thrcomm_t* gl_comm = bli_thrcomm_create( gl_nt );
for( int a = 0; a < jc_way; a++ )
{
thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt );
for( int b = 0; b < kc_way; b++ )
for( int b = 0; b < pc_way; b++ )
{
thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt );
thrcomm_t* pc_comm = bli_thrcomm_create( pc_nt );
for( int c = 0; c < ic_way; c++ )
{
@@ -246,73 +331,83 @@ printf( "=======================================\n" );
for( int e = 0; e < ir_way; e++ )
{
thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt );
dim_t ir_comm_id = 0;
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
dim_t global_comm_id = a*jc_nt + jc_comm_id;
//thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt );
dim_t ir_comm_id = 0;
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
dim_t pc_comm_id = c*ic_nt + ic_comm_id;
dim_t jc_comm_id = b*pc_nt + pc_comm_id;
dim_t gl_comm_id = a*jc_nt + jc_comm_id;
// macro-kernel loops
thrinfo_t* ir_info
=
bli_l3_thrinfo_create( jr_comm, jr_comm_id,
ir_comm, ir_comm_id,
ir_way, e,
NULL );
thrinfo_t* jr_info
=
bli_l3_thrinfo_create( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
jr_way, d,
ir_info );
// packa
thrinfo_t* pack_ic_in
thrinfo_t* pa_info
=
bli_packm_thrinfo_create( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
ic_nt, ic_comm_id,
jr_info );
// blk_var1
thrinfo_t* ic_info
=
bli_l3_thrinfo_create( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
bli_l3_thrinfo_create( pc_comm, pc_comm_id,
ic_way, c,
pack_ic_in );
pa_info );
// packb
thrinfo_t* pack_kc_in
thrinfo_t* pb_info
=
bli_packm_thrinfo_create( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id,
bli_packm_thrinfo_create( pc_comm, pc_comm_id,
pc_nt, pc_comm_id,
ic_info );
// blk_var3
thrinfo_t* kc_info
thrinfo_t* pc_info
=
bli_l3_thrinfo_create( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
kc_way, b,
pack_kc_in );
pc_way, b,
pb_info );
// blk_var2
thrinfo_t* jc_info
=
bli_l3_thrinfo_create( global_comm, global_comm_id,
jc_comm, jc_comm_id,
bli_l3_thrinfo_create( gl_comm, gl_comm_id,
jc_way, a,
kc_info );
pc_info );
paths[global_comm_id] = jc_info;
paths[gl_comm_id] = jc_info;
#ifdef PRINT_THRINFO
printf( " gl jc kc ic jr ir\n" );
printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu\n",
global_comm_id, jc_comm_id, kc_comm_id, ic_comm_id, jr_comm_id, ir_comm_id );
//printf( " a b c d e\n" );
printf( "work ids: %4ld %4ld %4ld %4ld %4ld\n", (long int)a, (long int)b, (long int)c, (long int)d, (long int)e );
printf( "---------------------------------------\n" );
{
dim_t gl_comm_id = bli_thread_ocomm_id( jc_info );
dim_t jc_comm_id = bli_thread_ocomm_id( pc_info );
dim_t pc_comm_id = bli_thread_ocomm_id( pb_info );
dim_t pb_comm_id = bli_thread_ocomm_id( ic_info );
dim_t ic_comm_id = bli_thread_ocomm_id( pa_info );
dim_t pa_comm_id = bli_thread_ocomm_id( jr_info );
dim_t jr_comm_id = bli_thread_ocomm_id( ir_info );
dim_t jc_work_id = bli_thread_work_id( jc_info );
dim_t pc_work_id = bli_thread_work_id( pc_info );
dim_t pb_work_id = bli_thread_work_id( pb_info );
dim_t ic_work_id = bli_thread_work_id( ic_info );
dim_t pa_work_id = bli_thread_work_id( pa_info );
dim_t jr_work_id = bli_thread_work_id( jr_info );
dim_t ir_work_id = bli_thread_work_id( ir_info );
printf( " gl jc pb kc pa ic jr \n" );
printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
gl_comm_id, jc_comm_id, pc_comm_id, pb_comm_id, ic_comm_id, pa_comm_id, jr_comm_id );
printf( "work ids: %4ld %4ld %4lu %4lu %4ld %4ld %4ld\n",
jc_work_id, pc_work_id, pb_work_id, ic_work_id, pa_work_id, jr_work_id, ir_work_id );
printf( "-------------------------------------------------\n" );
}
#endif
}
@@ -330,15 +425,16 @@ exit(1);
void bli_l3_thrinfo_free_paths
(
thrinfo_t** threads,
dim_t num
thrinfo_t** threads
)
{
dim_t n_threads = bli_thread_num_threads( threads[0] );
dim_t i;
for ( i = 0; i < num; ++i )
for ( i = 0; i < n_threads; ++i )
bli_l3_thrinfo_free( threads[i] );
bli_free_intl( threads );
}
#endif

View File

@@ -61,24 +61,22 @@
// thrinfo_t APIs specific to level-3 operations.
//
#if 0
thrinfo_t* bli_l3_thrinfo_create
(
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
);
#endif
void bli_l3_thrinfo_init
(
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
@@ -96,15 +94,37 @@ void bli_l3_thrinfo_free
// -----------------------------------------------------------------------------
thrinfo_t** bli_l3_thrinfo_create_paths
void bli_l3_thrinfo_create_root
(
opid_t l3_op,
side_t side
dim_t id,
thrcomm_t* gl_comm,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t** thread
);
void bli_l3_thrinfo_print_paths
(
thrinfo_t** threads
);
// -----------------------------------------------------------------------------
#if 0
thrinfo_t** bli_l3_thrinfo_create_roots
(
cntx_t* cntx,
cntl_t* cntl
);
thrinfo_t** bli_l3_thrinfo_create_full_paths
(
cntx_t* cntx
);
void bli_l3_thrinfo_free_paths
(
thrinfo_t** threads,
dim_t num
thrinfo_t** threads
);
#endif

View File

@@ -84,10 +84,10 @@ void bli_gemm_blk_var3
c,
cntx,
bli_cntl_sub_node( cntl ),
bli_thrinfo_sub_node( thread)
bli_thrinfo_sub_node( thread )
);
bli_thread_ibarrier( thread );
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it

View File

@@ -46,14 +46,21 @@ cntl_t* bli_gemm_cntl_create
if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
// Create a node for the macro-kernel.
cntl_t* gemm_cntl_bp_ke = bli_gemm_cntl_obj_create
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_obj_create
(
BLIS_NR, // bszid not used by macro-kernel.
macro_kernel_p,
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_obj_create
(
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
gemm_cntl_bu_ke
);
// Create a node for packing matrix A.
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
(
@@ -66,7 +73,7 @@ cntl_t* bli_gemm_cntl_create
FALSE, // reverse iteration if lower?
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
gemm_cntl_bp_ke
gemm_cntl_bp_bu
);
// Create a node for partitioning the m dimension by MC.

View File

@@ -85,13 +85,19 @@ void bli_gemm_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_GEMM, cntx );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx );
// Invoke the internal back-end.
// Create the first node in the thrinfo_t tree for each thread.
//thrinfo_t** infos = bli_l3_thrinfo_create_full_paths( cntx );
//bli_l3_thrinfo_print_paths( infos );
//exit(1);
//cntl = bli_gemm_cntl_create( BLIS_GEMM );
//thrinfo_t** infos = bli_l3_thrinfo_create_roots( cntx, cntl );
// Invoke the internal back-end via the thread handler.
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&a_local,
@@ -99,10 +105,12 @@ void bli_gemm_front
beta,
&c_local,
cntx,
cntl,
infos
cntl
);
//bli_l3_thrinfo_print_paths( infos );
//exit(1);
bli_l3_thrinfo_free_paths( infos, n_threads );
// Free the thrinfo_t structures.
//bli_l3_thrinfo_free_paths( infos );
}

View File

@@ -50,7 +50,6 @@ void bli_gemm_int
obj_t b_local;
obj_t c_local;
gemm_voft f;
ind_t im;
// Check parameters.
if ( bli_error_checking_is_enabled() )
@@ -102,17 +101,22 @@ void bli_gemm_int
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Create the next node in the thrinfo_t structure.
bli_thrinfo_grow( cntx, cntl, thread );
// Extract the function pointer from the current control tree node.
f = bli_cntl_var_func( cntl );
// Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations.
im = bli_cntx_get_ind_method( cntx );
if ( im != BLIS_NAT )
{
if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa;
else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2;
else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
ind_t im = bli_cntx_get_ind_method( cntx );
if ( im != BLIS_NAT )
{
if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa;
else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2;
else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
}
}
// Invoke the variant.

View File

@@ -92,13 +92,12 @@ void bli_hemm_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_GEMM, cntx );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx );
// Invoke the internal back-end.
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&a_local,
@@ -106,10 +105,7 @@ void bli_hemm_front
beta,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -110,14 +110,14 @@ void bli_her2k_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_HERK, cntx );
// Invoke herk twice, using beta only the first time.
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx );
// Invoke the internal back-end.
// Invoke herk twice, using beta only the first time.
// Invoke the internal back-end.
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&a_local,
@@ -125,13 +125,11 @@ void bli_her2k_front
beta,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
&alpha_conj,
&b_local,
@@ -139,12 +137,9 @@ void bli_her2k_front
&BLIS_ONE,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thrinfo_free_paths( infos, n_threads );
// The Hermitian rank-2k product was computed as A*B'+B*A', even for
// the diagonal elements. Mathematically, the imaginary components of
// diagonal elements of a Hermitian rank-2k product should always be

View File

@@ -90,13 +90,12 @@ void bli_herk_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_HERK, cntx );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx );
// Invoke the internal back-end.
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&a_local,
@@ -104,12 +103,9 @@ void bli_herk_front
beta,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thrinfo_free_paths( infos, n_threads );
// The Hermitian rank-k product was computed as A*A', even for the
// diagonal elements. Mathematically, the imaginary components of
// diagonal elements of a Hermitian rank-k product should always be

View File

@@ -91,13 +91,12 @@ void bli_symm_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_GEMM, cntx );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx );
// Invoke the internal back-end.
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&a_local,
@@ -105,10 +104,7 @@ void bli_symm_front
beta,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -91,14 +91,14 @@ void bli_syr2k_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_HERK, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx );
// Invoke herk twice, using beta only the first time.
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&a_local,
@@ -106,13 +106,11 @@ void bli_syr2k_front
beta,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&b_local,
@@ -120,10 +118,7 @@ void bli_syr2k_front
&BLIS_ONE,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -84,13 +84,12 @@ void bli_syrk_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_HERK, cntx );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx );
// Invoke the internal back-end.
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&a_local,
@@ -98,10 +97,7 @@ void bli_syrk_front
beta,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -134,13 +134,12 @@ void bli_trmm_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_TRMM, cntx );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM, side );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx );
// Invoke the internal back-end.
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&a_local,
@@ -148,10 +147,7 @@ void bli_trmm_front
&BLIS_ZERO,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -133,13 +133,12 @@ void bli_trmm3_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_TRMM, cntx );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM3, side );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx );
// Invoke the internal back-end.
bli_l3_thread_decorator
(
n_threads,
bli_gemm_int,
alpha,
&a_local,
@@ -147,10 +146,7 @@ void bli_trmm3_front
beta,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -87,7 +87,8 @@ void bli_trsm_blk_var3
bli_thrinfo_sub_node( thread )
);
bli_thread_ibarrier( thread );
//bli_thread_ibarrier( thread );
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal alpha scalars on A/B and C are non-zero, we must ensure

View File

@@ -50,14 +50,21 @@ cntl_t* bli_trsm_l_cntl_create
{
void* macro_kernel_p = bli_trsm_xx_ker_var2;
// Create a node for the macro-kernel.
cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create
// Create two nodes for the macro-kernel.
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create
(
BLIS_NR, // bszid not used by macro-kernel.
macro_kernel_p,
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create
(
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
trsm_cntl_bu_ke
);
// Create a node for packing matrix A.
cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create
(
@@ -70,7 +77,7 @@ cntl_t* bli_trsm_l_cntl_create
FALSE, // reverse iteration if lower?
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
trsm_cntl_bp_ke
trsm_cntl_bp_bu
);
// Create a node for partitioning the m dimension by MC.
@@ -122,14 +129,21 @@ cntl_t* bli_trsm_r_cntl_create
{
void* macro_kernel_p = bli_trsm_xx_ker_var2;
// Create a node for the macro-kernel.
cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create
// Create two nodes for the macro-kernel.
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create
(
BLIS_NR, // bszid not used by macro-kernel.
macro_kernel_p,
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create
(
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
trsm_cntl_bu_ke
);
// Create a node for packing matrix A.
cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create
(
@@ -142,7 +156,7 @@ cntl_t* bli_trsm_r_cntl_create
FALSE, // reverse iteration if lower?
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
trsm_cntl_bp_ke
trsm_cntl_bp_bu
);
// Create a node for partitioning the m dimension by MC.

View File

@@ -119,13 +119,12 @@ void bli_trsm_front
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_TRSM, cntx );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRSM, side );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx );
// Invoke the internal back-end.
bli_l3_thread_decorator
(
n_threads,
bli_trsm_int,
alpha,
&a_local,
@@ -133,10 +132,7 @@ void bli_trsm_front
alpha,
&c_local,
cntx,
cntl,
infos
cntl
);
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -117,6 +117,9 @@ void bli_trsm_int
// FGVZ->TMS: Is this barrier still needed?
bli_thread_obarrier( thread );
// Create the next node in the thrinfo_t structure.
bli_thrinfo_grow( cntx, cntl, thread );
// Extract the function pointer from the current control tree node.
f = bli_cntl_var_func( cntl );

View File

@@ -107,9 +107,13 @@ void bli_cntl_free
thrinfo_t* thread_sub_node = bli_thrinfo_sub_node( thread );
// Recursively free all memory associated with the sub-node and its
// children.
bli_cntl_free( cntl_sub_node, thread_sub_node );
// Only recurse if the current thrinfo_t node has a child.
if ( thread_sub_node != NULL )
{
// Recursively free all memory associated with the sub-node and its
// children.
bli_cntl_free( cntl_sub_node, thread_sub_node );
}
// Free the current node's params field, if it is non-NULL.
if ( cntl_params != NULL )

View File

@@ -341,6 +341,37 @@ pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx )
}
#endif
// Return the total thread count implied by a context: the product of the
// five ways of parallelism (jc, pc, ic, jr, ir) stored in the context.
dim_t bli_cntx_get_num_threads( cntx_t* cntx )
{
	const dim_t jc_way = bli_cntx_jc_way( cntx );
	const dim_t pc_way = bli_cntx_pc_way( cntx );
	const dim_t ic_way = bli_cntx_ic_way( cntx );
	const dim_t jr_way = bli_cntx_jr_way( cntx );
	const dim_t ir_way = bli_cntx_ir_way( cntx );

	return jc_way * pc_way * ic_way * jr_way * ir_way;
}
// Return the product of the ways of parallelism of every control tree node
// from cntl down through its chain of sub-nodes. Nodes whose blocksize id is
// BLIS_NO_PART contribute a factor of one. A NULL cntl yields 1.
dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl )
{
	dim_t   n_threads_in = 1;
	cntl_t* node         = cntl;

	while ( node != NULL )
	{
		const bszid_t bszid = bli_cntl_bszid( node );

		// Any bszid other than BLIS_NO_PART is assumed to be one of
		// {KR,MR,NR,MC,KC,NC}, i.e. a valid index into the context's
		// thrloop array. Non-partitioning nodes leave the product as is.
		if ( bszid != BLIS_NO_PART )
			n_threads_in *= bli_cntx_way_for_bszid( bszid, cntx );

		node = bli_cntl_sub_node( node );
	}

	return n_threads_in;
}
// -----------------------------------------------------------------------------
#if 1
@@ -663,6 +694,96 @@ void bli_cntx_set_pack_schema_c( pack_t schema_c,
bli_cntx_set_schema_c( schema_c, cntx );
}
// Read the requested per-loop ways of parallelism from the environment
// (when multithreading is enabled; otherwise every loop gets one way),
// adjust them for the given level-3 operation and side, and record the
// result in the context's thrloop array.
//
//   l3_op  the level-3 operation id (only BLIS_TRMM and BLIS_TRSM are
//          treated specially)
//   side   the side of the operation; consulted only for trmm and trsm
//   cntx   the context whose thrloop entries are overwritten
void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx )
{
#ifdef BLIS_ENABLE_MULTITHREADING
	const dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT" );
	// The pc loop is not read from BLIS_KC_NT; it is pinned to one way.
	const dim_t pc_env = 1;
	const dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT" );
	const dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT" );
	const dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT" );
#else
	const dim_t jc_env = 1;
	const dim_t pc_env = 1;
	const dim_t ic_env = 1;
	const dim_t jr_env = 1;
	const dim_t ir_env = 1;
#endif

	// Default assignment: used by trmm on the left side and by every
	// level-3 operation other than trmm and trsm.
	dim_t jc_way = jc_env;
	dim_t pc_way = pc_env;
	dim_t ic_way = ic_env;
	dim_t jr_way = jr_env;
	dim_t ir_way = ir_env;

	if ( l3_op == BLIS_TRMM )
	{
		// We reconfigure the parallelism for trmm_r due to a dependency in
		// the jc loop: the jc ways are folded into the jr loop instead.
		// (NOTE: This dependency does not exist for trmm3.)
		if ( bli_is_right( side ) )
		{
			jc_way = 1;
			jr_way = jr_env * jc_env;
		}
	}
	else if ( l3_op == BLIS_TRSM )
	{
		jc_way = 1;
		pc_way = 1;
		ir_way = 1;

		if ( bli_is_right( side ) )
		{
			// All parallelism goes to the ic loop.
			// NOTE(review): the ir request is not part of this product.
			ic_way = jc_env * ic_env * jr_env;
			jr_way = 1;
		}
		else // if ( bli_is_left( side ) )
		{
			// All parallelism goes to the jr loop.
			// NOTE(review): the jc request is not part of this product.
			ic_way = 1;
			jr_way = ic_env * jr_env * ir_env;
		}
	}

	bli_cntx_set_thrloop
	(
	  jc_way,
	  pc_way,
	  ic_way,
	  jr_way,
	  ir_way,
	  cntx
	);
}
// -----------------------------------------------------------------------------
bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt,

View File

@@ -59,6 +59,8 @@ typedef struct cntx_s
pack_t schema_b;
pack_t schema_c;
dim_t* thrloop;
membrk_t* membrk;
} cntx_t;
*/
@@ -127,6 +129,36 @@ typedef struct cntx_s
\
( (cntx)->membrk )
// Query the context's thrloop array, which holds one way-of-parallelism
// entry per blocksize id.
#define bli_cntx_thrloop( cntx ) \
\
( (cntx)->thrloop )
#if 1
// Convenience queries for individual loops. Each reads the thrloop entry
// indexed by the blocksize id named in the macro body.
// jc loop: entry for BLIS_NC.
#define bli_cntx_jc_way( cntx ) \
\
( (cntx)->thrloop[ BLIS_NC ] )
// pc loop: entry for BLIS_KC.
#define bli_cntx_pc_way( cntx ) \
\
( (cntx)->thrloop[ BLIS_KC ] )
// ic loop: entry for BLIS_MC.
#define bli_cntx_ic_way( cntx ) \
\
( (cntx)->thrloop[ BLIS_MC ] )
// jr loop: entry for BLIS_NR.
#define bli_cntx_jr_way( cntx ) \
\
( (cntx)->thrloop[ BLIS_NR ] )
// ir loop: entry for BLIS_MR.
#define bli_cntx_ir_way( cntx ) \
\
( (cntx)->thrloop[ BLIS_MR ] )
#endif
// Generic query: the thrloop entry for an arbitrary blocksize id. The bszid
// argument is used directly as the array index, so it must be a valid index
// into thrloop.
#define bli_cntx_way_for_bszid( bszid, cntx ) \
\
( (cntx)->thrloop[ bszid ] )
// cntx_t modification (fields only)
#define bli_cntx_set_blkszs_buf( _blkszs, cntx_p ) \
@@ -199,6 +231,16 @@ typedef struct cntx_s
(cntx_p)->membrk = _membrk; \
}
// Store the ways of parallelism for the jc, pc, ic, jr, and ir loops in the
// thrloop array of the context pointed to by cntx_p. The entries are indexed
// by blocksize id (NC, KC, MC, NR, MR respectively); the KR entry is always
// set to 1.
#define bli_cntx_set_thrloop( jc_way_, pc_way_, ic_way_, jr_way_, ir_way_, cntx_p ) \
{ \
	(cntx_p)->thrloop[ BLIS_NC ] = (jc_way_); \
	(cntx_p)->thrloop[ BLIS_KC ] = (pc_way_); \
	(cntx_p)->thrloop[ BLIS_MC ] = (ic_way_); \
	(cntx_p)->thrloop[ BLIS_NR ] = (jr_way_); \
	(cntx_p)->thrloop[ BLIS_MR ] = (ir_way_); \
	(cntx_p)->thrloop[ BLIS_KR ] = 1; \
}
// cntx_t query (complex)
#define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \
@@ -356,6 +398,8 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx );
dim_t bli_cntx_get_num_threads( cntx_t* cntx );
dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl );
// set functions
@@ -390,6 +434,9 @@ void bli_cntx_set_pack_schema_b( pack_t schema_b,
cntx_t* cntx );
void bli_cntx_set_pack_schema_c( pack_t schema_c,
cntx_t* cntx );
void bli_cntx_set_thrloop_from_env( opid_t l3_op,
side_t side,
cntx_t* cntx );
// other query functions

View File

@@ -145,6 +145,10 @@ void bli_free_align
int8_t* p_byte;
void** p_addr;
// If the pointer to free is NULL, it was obviously not aligned and
// does not need to be freed.
if ( p == NULL ) return;
// Since the bli_malloc_pool() function returned the aligned pointer,
// we have to first recover the original pointer before we can free
// the memory.

View File

@@ -1,203 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif
static membrk_t global_membrk;
// -----------------------------------------------------------------------------
// Return the address of the file-scope membrk_t object used as the global
// memory broker.
membrk_t* bli_mem_global_membrk( void )
{
	membrk_t* const membrk = &global_membrk;

	return membrk;
}
// Return the "size" of the global pool associated with buf_type, computed
// as the pool's block size multiplied by its number of blocks. For
// BLIS_BUFFER_FOR_GEN_USE the result is always 0, because the amount of
// general-purpose memory currently allocated is not (yet) tracked.
siz_t bli_mem_pool_size( packbuf_t buf_type )
{
	dim_t   pool_index;
	pool_t* buf_pool;
	siz_t   total;

	if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
		return 0;

	// Map the buffer type to its pool within the global membrk_t object.
	pool_index = bli_packbuf_index( buf_type );
	buf_pool   = bli_membrk_pool( pool_index, &global_membrk );

	total = bli_pool_block_size( buf_pool ) *
	        bli_pool_num_blocks( buf_pool );

	return total;
}
// -----------------------------------------------------------------------------
static bool_t bli_mem_is_init = FALSE;
// Initialize the global membrk_t object (and its memory pools) exactly once.
// A temporary gemm context is created to pass to bli_membrk_init() and is
// finalized before returning. The initialization itself is performed inside
// a critical section (an OpenMP named critical region or the
// mem_manager_mutex, depending on which threading macro is defined).
// NOTE(review): the early-return test below reads bli_mem_is_init without
// holding the lock; confirm that this unsynchronized read is acceptable on
// all supported platforms.
void bli_mem_init( void )
{
cntx_t cntx;
// If the initialization flag is TRUE, we know the API is already
// initialized, so we can return early.
if ( bli_mem_is_init == TRUE ) return;
// Create and initialize a context for gemm so we have something
// to pass into bli_membrk_init().
bli_gemm_cntx_init( &cntx );
#ifdef BLIS_ENABLE_OPENMP
_Pragma( "omp critical (mem)" )
#endif
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_lock( &mem_manager_mutex );
#endif
// BEGIN CRITICAL SECTION
{
// Here, we test the initialization flag again. NOTE: THIS IS NOT
// REDUNDANT. This additional test is needed so that other threads
// that may be waiting to acquire the lock do not perform any
// initialization actions once they are finally allowed into this
// critical section.
if ( bli_mem_is_init == FALSE )
{
// Initialize the global membrk_t object and its memory pools.
bli_membrk_init( &cntx, &global_membrk );
// After initialization, mark the API as initialized.
bli_mem_is_init = TRUE;
}
}
// END CRITICAL SECTION
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_unlock( &mem_manager_mutex );
#endif
// Finalize the temporary gemm context.
bli_gemm_cntx_finalize( &cntx );
}
// Reinitialize the memory pools of the global membrk_t object using the
// caller-supplied context. If the global object has not been initialized
// yet, it is initialized instead (and the flag is set). All of the work is
// done inside the same critical section used by bli_mem_init().
void bli_mem_reinit( cntx_t* cntx )
{
#ifdef BLIS_ENABLE_OPENMP
_Pragma( "omp critical (mem)" )
#endif
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_lock( &mem_manager_mutex );
#endif
// BEGIN CRITICAL SECTION
{
// If for some reason the memory pools have not yet been
// initialized (unlikely), we emulate the body of bli_mem_init().
if ( bli_mem_is_init == FALSE )
{
// Initialize the global membrk_t object and its memory pools.
bli_membrk_init( cntx, &global_membrk );
// After initialization, mark the API as initialized.
bli_mem_is_init = TRUE;
}
else
{
// Reinitialize the global membrk_t object's memory pools.
bli_membrk_reinit_pools( cntx, &global_membrk );
}
}
// END CRITICAL SECTION
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_unlock( &mem_manager_mutex );
#endif
}
// Finalize the global membrk_t object (and its memory pools) if it is
// currently initialized, and clear the initialization flag. The flag is
// tested once without the lock (for an early return) and again inside the
// critical section, mirroring bli_mem_init().
// NOTE(review): the early-return test reads bli_mem_is_init without holding
// the lock; confirm that this unsynchronized read is acceptable.
void bli_mem_finalize( void )
{
// If the initialization flag is FALSE, we know the API is already
// uninitialized, so we can return early.
if ( bli_mem_is_init == FALSE ) return;
#ifdef BLIS_ENABLE_OPENMP
_Pragma( "omp critical (mem)" )
#endif
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_lock( &mem_manager_mutex );
#endif
// BEGIN CRITICAL SECTION
{
// Here, we test the initialization flag again. NOTE: THIS IS NOT
// REDUNDANT. This additional test is needed so that other threads
// that may be waiting to acquire the lock do not perform any
// finalization actions once they are finally allowed into this
// critical section.
if ( bli_mem_is_init == TRUE )
{
// Finalize the global membrk_t object and its memory pools.
bli_membrk_finalize( &global_membrk );
// After finalization, mark the API as uninitialized.
bli_mem_is_init = FALSE;
}
}
// END CRITICAL SECTION
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_unlock( &mem_manager_mutex );
#endif
}
// Report the current value of the file-scope initialization flag, which is
// set by bli_mem_init()/bli_mem_reinit() and cleared by bli_mem_finalize().
bool_t bli_mem_is_initialized( void )
{
	const bool_t is_init = bli_mem_is_init;

	return is_init;
}

View File

@@ -1,366 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_PTHREADS
extern pthread_mutex_t mem_manager_mutex;
#endif
// Declare one memory pool structure for each block size/shape we want to
// be able to allocate.
static pool_t pools[3];
// Physically contiguous memory for each pool.
//
// Generally speaking, the pool sizes are computed in a sub-header of blis.h
// as follows:
//
// BLIS_MK_POOL_SIZE = BLIS_MAXIMUM_MC_? * BLIS_MAXIMUM_KC_? * BLIS_SIZEOF_?
//
// where "?" is the datatype that results in the largest pool size. The
// constants BLIS_KN_POOL_SIZE and BLIS_MN_POOL_SIZE are computed in a
// similar manner. All constants are computed with appropriate "padding"
// to ensure enough space given the alignments required by bli_config.h.
//
static void* pool_mk_blk_ptrs[ BLIS_NUM_MC_X_KC_BLOCKS ];
static void* pool_kn_blk_ptrs[ BLIS_NUM_KC_X_NC_BLOCKS ];
static void* pool_mn_blk_ptrs[ BLIS_NUM_MC_X_NC_BLOCKS ];
#define BLIS_USE_HEAP
#ifdef BLIS_USE_HEAP
static char* pool_mk_mem = NULL;
static char* pool_kn_mem = NULL;
static char* pool_mn_mem = NULL;
#else
static char pool_mk_mem[ BLIS_MK_POOL_SIZE ];
static char pool_kn_mem[ BLIS_KN_POOL_SIZE ];
static char pool_mn_mem[ BLIS_MN_POOL_SIZE ];
#endif
// Acquire a block of memory and record it in a mem_t object.
//
// - req_size: the size of the region the caller needs.
// - buf_type: the kind of buffer requested. BLIS_BUFFER_FOR_GEN_USE is
//   served by bli_malloc(); every other value is served by popping a block
//   from the corresponding file-scope pool.
// - mem:      the mem_t object to initialize. For pool blocks, its size
//   field is set to the pool's block size (NOT req_size).
//
// Errors (request larger than a pool block, or an exhausted pool) are
// reported through bli_check_error_code().
void bli_mem_acquire_m( siz_t     req_size,
                        packbuf_t buf_type,
                        mem_t*    mem )
{
	siz_t   block_size;
	dim_t   pool_index;
	pool_t* pool;
	void**  block_ptrs;
	void*   block;
	gint_t  i;
	err_t   e_val;

	if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
	{
		// For general-use buffer requests, such as those used by level-2
		// operations, using bli_malloc() is sufficient, since using
		// physically contiguous memory is not as important there.
		block = bli_malloc( req_size );

		// Initialize the mem_t object with:
		// - the address of the memory block,
		// - the buffer type (a packbuf_t value), and
		// - the size of the requested region.
		// NOTE: We do not initialize the pool field since this block did
		// not come from a contiguous memory pool.
		bli_mem_set_buffer( block, mem );
		bli_mem_set_buf_type( buf_type, mem );
		bli_mem_set_size( req_size, mem );
	}
	else
	{
		// This branch handles cases where the memory block needs to come
		// from one of the contiguous memory pools.

		// Map the requested packed buffer type to a zero-based index,
		// which we then use to select the corresponding memory pool.
		pool_index = bli_packbuf_index( buf_type );
		pool       = &pools[ pool_index ];

		// Make sure that the requested matrix size fits inside of a block
		// of the corresponding pool. This check is performed
		// unconditionally.
		e_val = bli_check_requested_block_size_for_pool( req_size, pool );
		bli_check_error_code( e_val );

		// Access the block pointer array from the memory pool data
		// structure.
		block_ptrs = bli_pool_block_ptrs( pool );

		// BEGIN CRITICAL SECTION
#ifdef BLIS_ENABLE_OPENMP
		_Pragma( "omp critical (mem)" )
#endif
#ifdef BLIS_ENABLE_PTHREADS
		pthread_mutex_lock( &mem_manager_mutex );
#endif
		{
			// Make sure that the pool contains at least one block to check
			// out to the thread. This test must happen while the lock is
			// held: if it were done beforehand, two threads could both
			// observe the last remaining block as available and then both
			// pop it, driving the top index past the bottom of the pool.
			e_val = bli_check_if_exhausted_pool( pool );
			bli_check_error_code( e_val );

			// Query the index of the contiguous memory block that resides
			// at the "top" of the pool.
			i = bli_pool_top_index( pool );

			// Extract the address of the top block from the block pointer
			// array. (Clearing the array entry is not necessary.)
			block = block_ptrs[i];

			// Decrement the top of the memory pool.
			bli_pool_dec_top_index( pool );
		}
#ifdef BLIS_ENABLE_PTHREADS
		pthread_mutex_unlock( &mem_manager_mutex );
#endif
		// END CRITICAL SECTION

		// Query the size of the blocks in the pool so we can store it in
		// the mem_t object.
		block_size = bli_pool_block_size( pool );

		// Initialize the mem_t object with:
		// - the address of the memory block,
		// - the buffer type (a packbuf_t value),
		// - the address of the memory pool to which it belongs, and
		// - the size of the contiguous memory block (NOT the size of the
		//   requested region).
		bli_mem_set_buffer( block, mem );
		bli_mem_set_buf_type( buf_type, mem );
		bli_mem_set_pool( pool, mem );
		bli_mem_set_size( block_size, mem );
	}
}
// Release the memory block recorded in a mem_t object and mark the object
// as unallocated. General-use buffers are returned via bli_free(); all
// other buffers are pushed back onto the pool recorded in the mem_t object.
void bli_mem_release( mem_t* mem )
{
	// The block being released and the kind of buffer it is.
	void*     blk  = bli_mem_buffer( mem );
	packbuf_t kind = bli_mem_buf_type( mem );

	if ( kind != BLIS_BUFFER_FOR_GEN_USE )
	{
		// The block came from one of the contiguous memory pools: look up
		// that pool and its array of block pointers.
		pool_t* owner = bli_mem_pool( mem );
		void**  slots = bli_pool_block_ptrs( owner );
		gint_t  top;

		// BEGIN CRITICAL SECTION
#ifdef BLIS_ENABLE_OPENMP
		_Pragma( "omp critical (mem)" )
#endif
#ifdef BLIS_ENABLE_PTHREADS
		pthread_mutex_lock( &mem_manager_mutex );
#endif
		{
			// Grow the pool by one entry and store the block's address in
			// the new top slot.
			bli_pool_inc_top_index( owner );
			top = bli_pool_top_index( owner );
			slots[top] = blk;
		}
#ifdef BLIS_ENABLE_PTHREADS
		pthread_mutex_unlock( &mem_manager_mutex );
#endif
		// END CRITICAL SECTION
	}
	else
	{
		// General-use buffers were allocated with bli_malloc(), so they
		// are returned with bli_free().
		bli_free( blk );
	}

	// Reset the buffer, pool, and size fields so the object appears
	// unallocated. The buf_type field is left alone because packbuf_t has
	// no "uninitialized" value.
	bli_mem_set_buffer( NULL, mem );
	bli_mem_set_pool( NULL, mem );
	bli_mem_set_size( 0, mem );
}
// Acquire a general-use buffer of req_size for mem. This is a convenience
// wrapper around bli_mem_acquire_m() with the buffer type fixed to
// BLIS_BUFFER_FOR_GEN_USE.
void bli_mem_acquire_v( siz_t  req_size,
                        mem_t* mem )
{
	const packbuf_t gen_use = BLIS_BUFFER_FOR_GEN_USE;

	bli_mem_acquire_m( req_size, gen_use, mem );
}
void bli_mem_init()
{
dim_t index_a;
dim_t index_b;
dim_t index_c;
#ifdef BLIS_USE_HEAP
pool_mk_mem = bli_malloc( BLIS_MK_POOL_SIZE );
pool_kn_mem = bli_malloc( BLIS_KN_POOL_SIZE );
pool_mn_mem = bli_malloc( BLIS_MN_POOL_SIZE );
#endif
// Map each of the packbuf_t values to an index starting at zero.
index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
// Initialize contiguous memory pool for MC x KC blocks.
bli_mem_init_pool( pool_mk_mem,
BLIS_MK_BLOCK_SIZE,
BLIS_NUM_MC_X_KC_BLOCKS,
pool_mk_blk_ptrs,
&pools[ index_a ] );
// Initialize contiguous memory pool for KC x NC blocks.
bli_mem_init_pool( pool_kn_mem,
BLIS_KN_BLOCK_SIZE,
BLIS_NUM_KC_X_NC_BLOCKS,
pool_kn_blk_ptrs,
&pools[ index_b ] );
// Initialize contiguous memory pool for MC x NC blocks.
bli_mem_init_pool( pool_mn_mem,
BLIS_MN_BLOCK_SIZE,
BLIS_NUM_MC_X_NC_BLOCKS,
pool_mn_blk_ptrs,
&pools[ index_c ] );
}
// Carve a region of memory into num_blocks blocks and initialize a pool_t
// object to manage them.
//
// - pool_mem:   start of the backing memory for the pool.
// - block_size: size of each block.
// - num_blocks: number of blocks to carve out.
// - block_ptrs: array that receives the starting address of each block.
// - pool:       the pool_t object to initialize.
//
// Every block address stored in block_ptrs is first advanced, if needed, to
// the next multiple of BLIS_CONTIG_ADDR_ALIGN_SIZE.
void bli_mem_init_pool( char*   pool_mem,
                        siz_t   block_size,
                        dim_t   num_blocks,
                        void**  block_ptrs,
                        pool_t* pool )
{
	const siz_t align_size = BLIS_CONTIG_ADDR_ALIGN_SIZE;
	char*       cursor     = pool_mem;
	dim_t       blk;

	for ( blk = 0; blk < num_blocks; ++blk )
	{
		// Bump the cursor up to the next alignment boundary unless it
		// already sits on one. The modulo-based adjustment works even if
		// the alignment is not a power of two.
		if ( bli_is_unaligned_to( ( uintptr_t )cursor, ( uintptr_t )align_size ) )
		{
			cursor += ( ( uintptr_t )align_size -
			            ( ( uintptr_t )cursor % align_size ) );
		}

		// Record the (now aligned) start of this block, then step over it.
		block_ptrs[blk] = cursor;
		cursor += block_size;
	}

	// With the block addresses recorded, set up the pool_t object that
	// manages them.
	bli_pool_init( num_blocks,
	               block_size,
	               block_ptrs,
	               pool );
}
// Release the backing memory of the three pools. When BLIS_USE_HEAP is
// defined, the buffers obtained in bli_mem_init() are freed and the
// pointers are reset to NULL so that they are never left dangling; without
// the reset, a second call to this function would free the same addresses
// again. When BLIS_USE_HEAP is not defined, there is nothing to do.
void bli_mem_finalize()
{
#ifdef BLIS_USE_HEAP
	bli_free( pool_mk_mem );
	pool_mk_mem = NULL;

	bli_free( pool_kn_mem );
	pool_kn_mem = NULL;

	bli_free( pool_mn_mem );
	pool_mn_mem = NULL;
#endif
}

View File

@@ -1,70 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_AUXINFO_MACRO_DEFS_H
#define BLIS_AUXINFO_MACRO_DEFS_H

// auxinfo_t field query
//
// NOTE: The query macros dereference their argument with "->", so they take
// a pointer to the auxinfo_t object.

#define bli_auxinfo_schema_a( auxinfo_p ) ( (auxinfo_p)->schema_a )
#define bli_auxinfo_schema_b( auxinfo_p ) ( (auxinfo_p)->schema_b )

#define bli_auxinfo_next_a( auxinfo_p )   ( (auxinfo_p)->a_next )
#define bli_auxinfo_next_b( auxinfo_p )   ( (auxinfo_p)->b_next )

#define bli_auxinfo_is_a( auxinfo_p )     ( (auxinfo_p)->is_a )
#define bli_auxinfo_is_b( auxinfo_p )     ( (auxinfo_p)->is_b )

// auxinfo_t field modification
//
// NOTE: Unlike the query macros, the modification macros access fields with
// ".", so they take the auxinfo_t object itself rather than a pointer.

#define bli_auxinfo_set_schema_a( schema_val, auxinfo_obj ) \
{ \
	(auxinfo_obj).schema_a = (schema_val); \
}
#define bli_auxinfo_set_schema_b( schema_val, auxinfo_obj ) \
{ \
	(auxinfo_obj).schema_b = (schema_val); \
}

#define bli_auxinfo_set_next_a( next_a_p, auxinfo_obj ) \
{ \
	(auxinfo_obj).a_next = (next_a_p); \
}
#define bli_auxinfo_set_next_b( next_b_p, auxinfo_obj ) \
{ \
	(auxinfo_obj).b_next = (next_b_p); \
}

// Set both "next" addresses in one step.
#define bli_auxinfo_set_next_ab( next_a_p, next_b_p, auxinfo_obj ) \
{ \
	(auxinfo_obj).a_next = (next_a_p); \
	(auxinfo_obj).b_next = (next_b_p); \
}

#define bli_auxinfo_set_is_a( is_val, auxinfo_obj ) \
{ \
	(auxinfo_obj).is_a = (is_val); \
}
#define bli_auxinfo_set_is_b( is_val, auxinfo_obj ) \
{ \
	(auxinfo_obj).is_b = (is_val); \
}

#endif

View File

@@ -639,6 +639,21 @@ typedef enum
#define BLIS_NUM_UKR_IMPL_TYPES 4
#if 0
typedef enum
{
BLIS_JC_IDX = 0,
BLIS_PC_IDX,
BLIS_IC_IDX,
BLIS_JR_IDX,
BLIS_IR_IDX,
BLIS_PR_IDX,
} thridx_t;
#endif
#define BLIS_NUM_LOOPS 6
// -- Operation ID type --
typedef enum
@@ -950,6 +965,8 @@ typedef struct cntx_s
pack_t schema_b;
pack_t schema_c;
dim_t thrloop[ BLIS_NUM_LOOPS ];
membrk_t* membrk;
} cntx_t;

View File

@@ -41,6 +41,12 @@
#include "bli_thrcomm_openmp.h"
#include "bli_thrcomm_pthreads.h"
// thrcomm_t query (field only)
#define bli_thrcomm_num_threads( comm ) ( (comm)->n_threads )
// Thread communicator prototypes.
thrcomm_t* bli_thrcomm_create( dim_t n_threads );
void bli_thrcomm_free( thrcomm_t* communicator );

View File

@@ -201,7 +201,6 @@ void bli_thrcomm_tree_barrier( barrier_t* barack )
void bli_l3_thread_decorator
(
dim_t n_threads,
l3int_t func,
obj_t* alpha,
obj_t* a,
@@ -209,20 +208,28 @@ void bli_l3_thread_decorator
obj_t* beta,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t** thread
cntl_t* cntl
)
{
// Query the total number of threads from the context.
dim_t n_threads = bli_cntx_get_num_threads( cntx );
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* gl_comm = bli_thrcomm_create( n_threads );
_Pragma( "omp parallel num_threads(n_threads)" )
{
dim_t omp_id = omp_get_thread_num();
thrinfo_t* thread_i = thread[omp_id];
dim_t id = omp_get_thread_num();
cntl_t* cntl_use;
thrinfo_t* thread;
// Create a default control tree for the operation, if needed.
bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use );
// Create the root node of the current thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread );
func
(
alpha,
@@ -232,12 +239,19 @@ void bli_l3_thread_decorator
c,
cntx,
cntl_use,
thread[omp_id]
thread
);
// Free the control tree, if one was created locally.
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i );
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread );
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( thread );
}
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
}
#endif

View File

@@ -136,7 +136,8 @@ typedef struct thread_data
obj_t* c;
cntx_t* cntx;
cntl_t* cntl;
thrinfo_t* thread;
dim_t id;
thrcomm_t* gl_comm;
} thread_data_t;
// Entry point for additional threads
@@ -151,13 +152,18 @@ void* bli_l3_thread_entry( void* data_void )
obj_t* c = data->c;
cntx_t* cntx = data->cntx;
cntl_t* cntl = data->cntl;
thrinfo_t* thread_i = data->thread;
dim_t id = data->id;
thrcomm_t* gl_comm = data->gl_comm;
cntl_t* cntl_use;
thrinfo_t* thread;
// Create a default control tree for the operation, if needed.
bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use );
// Create the root node of the current thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread );
data->func
(
alpha,
@@ -171,14 +177,16 @@ void* bli_l3_thread_entry( void* data_void )
);
// Free the control tree, if one was created locally.
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i );
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread );
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( thread );
return NULL;
}
void bli_l3_thread_decorator
(
dim_t n_threads,
l3int_t func,
obj_t* alpha,
obj_t* a,
@@ -186,50 +194,51 @@ void bli_l3_thread_decorator
obj_t* beta,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t** thread
cntl_t* cntl
)
{
pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads );
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
// Query the total number of threads from the context.
dim_t n_threads = bli_cntx_get_num_threads( cntx );
for ( int i = 1; i < n_threads; i++ )
// Allocate an array of pthread objects and auxiliary data structs to pass
// to the thread entry functions.
pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads );
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* gl_comm = bli_thrcomm_create( n_threads );
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
// can spawn all other threads before proceeding with its own computation.
for ( dim_t id = n_threads - 1; 0 <= id; id-- )
{
// Set up thread data for additional threads (beyond thread 0).
datas[i].func = func;
datas[i].alpha = alpha;
datas[i].a = a;
datas[i].b = b;
datas[i].beta = beta;
datas[i].c = c;
datas[i].cntx = cntx;
datas[i].cntl = cntl;
datas[i].thread = thread[i];
datas[id].func = func;
datas[id].alpha = alpha;
datas[id].a = a;
datas[id].b = b;
datas[id].beta = beta;
datas[id].c = c;
datas[id].cntx = cntx;
datas[id].cntl = cntl;
datas[id].id = id;
datas[id].gl_comm = gl_comm;
// Spawn additional threads.
pthread_create( &pthreads[i], NULL, &bli_l3_thread_entry, &datas[i] );
}
// The main thread executes this.
{
cntl_t* cntl_use;
// Create a default control tree for the operation, if needed.
bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use );
// Thread 0 simply executes func.
func( alpha, a, b, beta, c, cntx, cntl, thread[0] );
// Free the control tree, if one was created locally.
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread[0] );
// Spawn additional threads for ids greater than 0.
if ( id != 0 )
pthread_create( &pthreads[id], NULL, &bli_l3_thread_entry, &datas[id] );
else
bli_l3_thread_entry( ( void* )(&datas[0]) );
}
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called from the thread entry function).
// Thread 0 waits for additional threads to finish.
for ( int i = 1; i < n_threads; i++)
for ( dim_t id = 1; id < n_threads; id++ )
{
pthread_join( pthreads[i], NULL );
pthread_join( pthreads[id], NULL );
}
bli_free_intl( pthreads );

View File

@@ -73,7 +73,6 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id )
void bli_l3_thread_decorator
(
dim_t n_threads,
l3int_t func,
obj_t* alpha,
obj_t* a,
@@ -81,17 +80,25 @@ void bli_l3_thread_decorator
obj_t* beta,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t** thread
cntl_t* cntl
)
{
thrinfo_t* thread_i = thread[0];
// For sequential execution, we use only one thread.
dim_t n_threads = 1;
dim_t id = 0;
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* gl_comm = bli_thrcomm_create( n_threads );
cntl_t* cntl_use;
thrinfo_t* thread;
// Create a default control tree for the operation, if needed.
bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use );
// Create the root node of the thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread );
func
(
alpha,
@@ -101,11 +108,18 @@ void bli_l3_thread_decorator
c,
cntx,
cntl_use,
thread[0]
thread
);
// Free the control tree, if one was created locally.
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i );
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread );
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( thread );
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
}

View File

@@ -78,8 +78,8 @@ void bli_thread_get_range_sub
dim_t* end
)
{
dim_t n_way = thread->n_way;
dim_t work_id = thread->work_id;
dim_t n_way = bli_thread_n_way( thread );
dim_t work_id = bli_thread_work_id( thread );
dim_t all_start = 0;
dim_t all_end = n;
@@ -511,8 +511,8 @@ siz_t bli_thread_get_range_weighted_sub
dim_t* j_end_thr
)
{
dim_t n_way = thread->n_way;
dim_t my_id = thread->work_id;
dim_t n_way = bli_thread_n_way( thread );
dim_t my_id = bli_thread_work_id( thread );
dim_t bf_left = n % bf;

View File

@@ -173,16 +173,14 @@ typedef void (*l3int_t)
// Level-3 thread decorator prototype
void bli_l3_thread_decorator
(
dim_t n_threads,
l3int_t func,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t** thread
l3int_t func,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl
);
// Miscellaneous prototypes

View File

@@ -38,11 +38,9 @@ thrinfo_t* bli_thrinfo_create
(
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
bool_t free_comms,
bool_t free_comm,
thrinfo_t* sub_node
)
{
@@ -52,9 +50,8 @@ thrinfo_t* bli_thrinfo_create
(
thread,
ocomm, ocomm_id,
icomm, icomm_id,
n_way, work_id,
free_comms,
free_comm,
sub_node
);
@@ -66,23 +63,19 @@ void bli_thrinfo_init
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
bool_t free_comms,
bool_t free_comm,
thrinfo_t* sub_node
)
{
thread->ocomm = ocomm;
thread->ocomm_id = ocomm_id;
thread->icomm = icomm;
thread->icomm_id = icomm_id;
thread->n_way = n_way;
thread->work_id = work_id;
thread->free_comms = free_comms;
thread->ocomm = ocomm;
thread->ocomm_id = ocomm_id;
thread->n_way = n_way;
thread->work_id = work_id;
thread->free_comm = free_comm;
thread->sub_node = sub_node;
thread->sub_node = sub_node;
}
void bli_thrinfo_init_single
@@ -94,7 +87,6 @@ void bli_thrinfo_init_single
(
thread,
&BLIS_SINGLE_COMM, 0,
&BLIS_SINGLE_COMM, 0,
1,
0,
FALSE,
@@ -102,3 +94,178 @@ void bli_thrinfo_init_single
);
}
// -----------------------------------------------------------------------------
#include "assert.h"
#define BLIS_NUM_STATIC_COMMS 18
// Create and return the thrinfo_t node that corresponds to the control tree
// node cntl_chl, given the calling thread's parent node thread_par.
//
// - cntx:       context queried for the ways of parallelism.
// - cntl_par:   parent control tree node (not referenced in this body).
// - cntl_chl:   child control tree node; its bszid selects the child's
//               n_way, and it is used to compute the number of threads in
//               the child's communicator.
// - thread_par: the calling thread's parent thrinfo_t node.
//
// The returned node is created with free_comm set to TRUE and a NULL
// sub-node. This function calls barriers and a broadcast on the parent's
// communicator, so it must be called by every thread of that communicator.
thrinfo_t* bli_thrinfo_create_for_cntl
(
cntx_t* cntx,
cntl_t* cntl_par,
cntl_t* cntl_chl,
thrinfo_t* thread_par
)
{
// Temporary array of communicator pointers used when parent_n_way does
// not exceed BLIS_NUM_STATIC_COMMS; otherwise the array is heap-allocated.
thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ];
thrcomm_t** new_comms = NULL;
thrinfo_t* thread_chl;
bszid_t bszid_chl = bli_cntl_bszid( cntl_chl );
dim_t parent_nt_in = bli_thread_num_threads( thread_par );
dim_t parent_n_way = bli_thread_n_way( thread_par );
dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
dim_t parent_work_id = bli_thread_work_id( thread_par );
dim_t child_nt_in;
dim_t child_comm_id;
dim_t child_n_way;
dim_t child_work_id;
// Sanity check: make sure the number of threads in the parent's
// communicator is divisible by the number of new sub-groups.
assert( parent_nt_in % parent_n_way == 0 );
// Compute:
// - the number of threads inside the new child comm,
// - the current thread's id within the new communicator,
// - the current thread's work id, given the ways of parallelism
// to be obtained within the next loop.
child_nt_in = bli_cntx_get_num_threads_in( cntx, cntl_chl );
child_n_way = bli_cntx_way_for_bszid( bszid_chl, cntx );
child_comm_id = parent_comm_id % child_nt_in;
child_work_id = child_comm_id / ( child_nt_in / child_n_way );
// The parent's chief thread creates a temporary array of thrcomm_t
// pointers.
if ( bli_thread_am_ochief( thread_par ) )
{
if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ) );
else
new_comms = static_comms;
}
// Broadcast the temporary array to all threads in the parent's
// communicator.
// NOTE(review): when the chief selected static_comms above, the address
// being broadcast is that of the chief's local array; the barriers below
// appear to keep the chief inside this function until all threads have
// finished using it -- confirm.
new_comms = bli_thread_obroadcast( thread_par, new_comms );
// Chiefs in the child communicator allocate the communicator
// object and store it in the array element corresponding to the
// parent's work id.
if ( child_comm_id == 0 )
new_comms[ parent_work_id ] = bli_thrcomm_create( child_nt_in );
// Wait until every child chief has stored its communicator before any
// thread reads the array.
bli_thread_obarrier( thread_par );
// All threads create a new thrinfo_t node using the communicator
// that was created by their chief, as identified by parent_work_id.
thread_chl = bli_thrinfo_create
(
new_comms[ parent_work_id ],
child_comm_id,
child_n_way,
child_work_id,
TRUE,
NULL
);
// Wait until every thread has read its entry before the array may be
// freed below.
bli_thread_obarrier( thread_par );
// The parent's chief thread frees the temporary array of thrcomm_t
// pointers.
if ( bli_thread_am_ochief( thread_par ) )
{
if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
bli_free_intl( new_comms );
}
return thread_chl;
}
// Make sure the thrinfo_t node 'thread' has a sub-node. A sub-node that is
// already present is left untouched. Otherwise a child node (or chain of
// nodes) is built by bli_thrinfo_rgrow(), starting from the sub-node of
// 'cntl', and the node it returns is attached to 'thread'.
void bli_thrinfo_grow
     (
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Only do work when 'thread' has no sub-node yet.
	if ( bli_thrinfo_sub_node( thread ) == NULL )
	{
		// The control tree node that the new child will correspond to.
		cntl_t* cntl_child = bli_cntl_sub_node( cntl );

		// Build the missing node(s); the (eldest) child is returned.
		thrinfo_t* new_child = bli_thrinfo_rgrow( cntx, cntl, cntl_child, thread );

		// Hook the new child into its parent.
		bli_thrinfo_set_sub_node( new_child, thread );
	}
}
// Recursively create the thrinfo_t node that corresponds to the control
// tree node 'cntl_cur' and return it.
//
// - If cntl_cur is a partitioning node (its bszid is not BLIS_NO_PART),
//   the node is created by bli_thrinfo_create_for_cntl() with cntl_par
//   as the parent, and returned without being attached to thread_par.
// - Otherwise (a non-partitioning node, ie: packing), the function first
//   recurses on the sub-node of cntl_cur, then creates a node that reuses
//   the communicator of the node returned by that recursion, attaches it
//   to thread_par, and returns it.
thrinfo_t* bli_thrinfo_rgrow
     (
       cntx_t*    cntx,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     )
{
	// Partitioning node: a single call creates the child directly.
	if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART )
	{
		return bli_thrinfo_create_for_cntl( cntx, cntl_par, cntl_cur, thread_par );
	}

	// Non-partitioning node: first grow everything below cntl_cur. The
	// node returned is the top-most thrinfo_t of that lower segment.
	thrinfo_t* lower_top = bli_thrinfo_rgrow
	(
	  cntx,
	  cntl_par,
	  bli_cntl_sub_node( cntl_cur ),
	  thread_par
	);

	// Now create the node for cntl_cur itself. It borrows the communicator
	// and communicator id of lower_top, uses that same id as its work id,
	// and takes lower_top as its sub-node. free_comm is FALSE because the
	// communicator is shared; it will be freed when lower_top, or one of
	// its descendants, is freed.
	thrinfo_t* shared_node = bli_thrinfo_create
	(
	  bli_thrinfo_ocomm( lower_top ),
	  bli_thread_ocomm_id( lower_top ),
	  bli_cntx_get_num_threads_in( cntx, cntl_cur ),
	  bli_thread_ocomm_id( lower_top ),
	  FALSE,
	  lower_top
	);

	// Attach the newly created node to the parent structure.
	bli_thrinfo_set_sub_node( shared_node, thread_par );

	return shared_node;
}

View File

@@ -45,13 +45,6 @@ struct thrinfo_s
// Our thread id within the ocomm thread communicator.
dim_t ocomm_id;
// The thread communicator for the other threads sharing the same work
// at this level.
thrcomm_t* icomm;
// Our thread id within the icomm thread communicator.
dim_t icomm_id;
// The number of distinct threads used to parallelize the loop.
dim_t n_way;
@@ -62,7 +55,7 @@ struct thrinfo_s
// this field is true, but when nodes are created that share the same
// communicators as other nodes (such as with packm nodes), this is set
// to false.
bool_t free_comms;
bool_t free_comm;
struct thrinfo_s* sub_node;
};
@@ -71,30 +64,40 @@ typedef struct thrinfo_s thrinfo_t;
//
// thrinfo_t macros
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs. bli_thread_)
//
#define bli_thread_num_threads( t ) ( (t)->ocomm->n_threads )
// thrinfo_t query (field only)
#define bli_thread_n_way( t ) ( (t)->n_way )
#define bli_thread_work_id( t ) ( (t)->work_id )
#define bli_thread_num_threads( t ) ( (t)->ocomm->n_threads )
#define bli_thread_am_ochief( t ) ( (t)->ocomm_id == 0 )
#define bli_thread_am_ichief( t ) ( (t)->icomm_id == 0 )
#define bli_thread_n_way( t ) ( (t)->n_way )
#define bli_thread_work_id( t ) ( (t)->work_id )
#define bli_thread_ocomm_id( t ) ( (t)->ocomm_id )
#define bli_thrinfo_ocomm( t ) ( (t)->ocomm )
#define bli_thrinfo_needs_free_comm( t ) ( (t)->free_comm )
#define bli_thrinfo_sub_node( t ) ( (t)->sub_node )
// thrinfo_t query (complex)
#define bli_thread_am_ochief( t ) ( (t)->ocomm_id == 0 )
// thrinfo_t modification
#define bli_thrinfo_set_sub_node( _sub_node, thread ) \
{ \
(thread)->sub_node = _sub_node; \
}
// other thrinfo_t-related macros
#define bli_thread_obroadcast( t, p ) bli_thrcomm_bcast( (t)->ocomm, \
(t)->ocomm_id, p )
#define bli_thread_ibroadcast( t, p ) bli_thrcomm_bcast( (t)->icomm, \
(t)->icomm_id, p )
#define bli_thread_obarrier( t ) bli_thrcomm_barrier( (t)->ocomm, \
(t)->ocomm_id )
#define bli_thread_ibarrier( t ) bli_thrcomm_barrier( (t)->icomm, \
(t)->icomm_id )
#define bli_thrinfo_ocomm( t ) ( (t)->ocomm )
#define bli_thrinfo_icomm( t ) ( (t)->icomm )
#define bli_thrinfo_needs_free_comms( t ) ( (t)->free_comms )
#define bli_thrinfo_sub_node( t ) ( (t)->sub_node )
//
// Prototypes for level-3 thrinfo functions not specific to any operation.
@@ -104,11 +107,9 @@ thrinfo_t* bli_thrinfo_create
(
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
bool_t free_comms,
bool_t free_comm,
thrinfo_t* sub_node
);
@@ -117,11 +118,9 @@ void bli_thrinfo_init
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
bool_t free_comms,
bool_t free_comm,
thrinfo_t* sub_node
);
@@ -130,9 +129,29 @@ void bli_thrinfo_init_single
thrinfo_t* thread
);
void bli_thrinfo_free
// -----------------------------------------------------------------------------
thrinfo_t* bli_thrinfo_create_for_cntl
(
cntx_t* cntx,
cntl_t* cntl_par,
cntl_t* cntl_chl,
thrinfo_t* thread_par
);
void bli_thrinfo_grow
(
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* thread
);
thrinfo_t* bli_thrinfo_rgrow
(
cntx_t* cntx,
cntl_t* cntl_par,
cntl_t* cntl_cur,
thrinfo_t* thread_par
);
#endif

View File

@@ -1 +1 @@
0.2.0-37
0.2.1