Renamed bli_thread_obarrier(), bli_thread_obroadcast().

Details:
- Renamed two bli_thread_*() APIs:
    bli_thread_obarrier()   -> bli_thread_barrier()
    bli_thread_obroadcast() -> bli_thread_broadcast()
  The 'o' was a leftover from when thrinfo_t objects tracked both
  "inner" and "outer" communicators. They have long since been
  simplified to support only the latter (the "outer" communicator,
  still stored in the ocomm field), and thus the 'o' is superfluous.
This commit is contained in:
Field G. Van Zee
2020-02-25 14:50:53 -06:00
parent f6e6bf73e6
commit c01d249d7c
13 changed files with 35 additions and 35 deletions

View File

@@ -667,7 +667,7 @@ if ( col_stored ) { \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -678,7 +678,7 @@ bli_thread_obarrier( thread ); \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
@@ -691,7 +691,7 @@ else { \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -702,7 +702,7 @@ bli_thread_obarrier( thread ); \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
*/
/*

View File

@@ -73,6 +73,6 @@ void bli_unpackm_int
}
// Barrier so that unpacking is done before computation.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}

View File

@@ -50,7 +50,7 @@ void bli_l3_packm
siz_t size_needed;
// FGVZ: Not sure why we need this barrier, but we do.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
// Every thread initializes x_pack and determines the size of memory
// block needed (which gets embedded into the otherwise "blank" mem_t
@@ -102,7 +102,7 @@ void bli_l3_packm
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the contents of the chief thread's local mem_t entry to the
// mem_t field in this thread's control tree node.
@@ -146,7 +146,7 @@ void bli_l3_packm
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the chief thread's local mem_t entry to the mem_t field in
// this thread's control tree node.
@@ -159,7 +159,7 @@ void bli_l3_packm
// will already have the cached values in their local control
// trees' mem_t entries, currently pointed to by cntl_mem_p.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}
}
@@ -182,6 +182,6 @@ void bli_l3_packm
);
// Barrier so that packing is done before computation.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}

View File

@@ -237,7 +237,7 @@ if ( col_stored ) { \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -248,7 +248,7 @@ bli_thread_obarrier( thread ); \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
@@ -261,7 +261,7 @@ else { \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
@@ -272,7 +272,7 @@ bli_thread_obarrier( thread ); \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
*/
/*

View File

@@ -84,7 +84,7 @@ void bli_gemm_blk_var3
bli_thrinfo_sub_node( thread )
);
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it

View File

@@ -66,7 +66,7 @@ void bli_gemm_int
{
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
return;
}
@@ -80,7 +80,7 @@ void bli_gemm_int
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
return;
}

View File

@@ -117,7 +117,7 @@ void bli_trsm_blk_var1
// We must execute a barrier here because the upcoming rank-k update
// requires the packed matrix B to be fully updated by the trsm
// subproblem.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
// Isolate the remaining part of the column panel matrix A, which we do by
// acquiring the subpartition ahead of A11 (that is, A21 or A01, depending

View File

@@ -85,7 +85,7 @@ void bli_trsm_blk_var3
);
//bli_thread_ibarrier( thread );
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal alpha scalars on A/B and C are non-zero, we must ensure

View File

@@ -68,7 +68,7 @@ void bli_trsm_int
{
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
return;
}
@@ -119,7 +119,7 @@ void bli_trsm_int
}
// FGVZ->TMS: Is this barrier still needed?
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
// Create the next node in the thrinfo_t structure.
bli_thrinfo_grow( rntm, cntl, thread );

View File

@@ -340,7 +340,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
// Broadcast the temporary array to all threads in the parent's
// communicator.
new_comms = bli_thread_obroadcast( thread_par, new_comms );
new_comms = bli_thread_broadcast( thread_par, new_comms );
// Chiefs in the child communicator allocate the communicator
// object and store it in the array element corresponding to the
@@ -348,7 +348,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
if ( child_comm_id == 0 )
new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
// All threads create a new thrinfo_t node using the communicator
// that was created by their chief, as identified by parent_work_id.
@@ -364,7 +364,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
NULL // sub_node
);
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
// The parent's chief thread frees the temporary array of thrcomm_t
// pointers.
@@ -477,7 +477,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
const dim_t child_comm_id = parent_comm_id % child_nt_in;
const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
// NOTE: Recall that parent_comm_id == child_comm_id, so checking for the
// parent's chief-ness is equivalent to checking for chief-ness in the new
@@ -488,7 +488,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
// Broadcast the new thrcomm_t address to the other threads in the
// parent's group.
new_comm = bli_thread_obroadcast( thread_par, new_comm );
new_comm = bli_thread_broadcast( thread_par, new_comm );
// All threads create a new thrinfo_t node using the communicator
// that was created by their chief, as identified by parent_work_id.
@@ -504,7 +504,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
NULL // sub_node
);
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
return thread_chl;
}

View File

@@ -141,12 +141,12 @@ static void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t )
// other thrinfo_t-related functions
static void* bli_thread_obroadcast( thrinfo_t* t, void* p )
static void* bli_thread_broadcast( thrinfo_t* t, void* p )
{
return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
}
static void bli_thread_obarrier( thrinfo_t* t )
static void bli_thread_barrier( thrinfo_t* t )
{
bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
}

View File

@@ -51,7 +51,7 @@ void blx_l3_packm
siz_t size_needed;
// FGVZ: Not sure why we need this barrier, but we do.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
// Every thread initializes x_pack and determines the size of memory
// block needed (which gets embedded into the otherwise "blank" mem_t
@@ -102,7 +102,7 @@ void blx_l3_packm
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the contents of the chief thread's local mem_t entry to the
// mem_t field in this thread's control tree node.
@@ -142,7 +142,7 @@ void blx_l3_packm
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the chief thread's local mem_t entry to the mem_t field in
// this thread's control tree node.
@@ -155,7 +155,7 @@ void blx_l3_packm
// will already have the cached values in their local control
// trees' mem_t entries, currently pointed to by cntl_mem_p.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}
}
@@ -178,6 +178,6 @@ void blx_l3_packm
);
// Barrier so that packing is done before computation.
bli_thread_obarrier( thread );
bli_thread_barrier( thread );
}

View File

@@ -73,7 +73,7 @@ void blx_gemm_blk_var3
bli_thrinfo_sub_node( thread )
);
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it