mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Renamed bli_thread_obarrier() and bli_thread_obroadcast().
Details:
- Renamed two bli_thread_*() APIs:
    bli_thread_obarrier()   -> bli_thread_barrier()
    bli_thread_obroadcast() -> bli_thread_broadcast()
  The 'o' prefix was a leftover from when thrcomm_t objects tracked both
  "inner" and "outer" communicators. They have long since been
  simplified to support only outer communicators, so the 'o' is now
  superfluous.
This commit is contained in:
@@ -667,7 +667,7 @@ if ( col_stored ) { \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
@@ -678,7 +678,7 @@ bli_thread_obarrier( thread ); \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
else { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
@@ -691,7 +691,7 @@ else { \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
@@ -702,7 +702,7 @@ bli_thread_obarrier( thread ); \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
|
||||
@@ -73,6 +73,6 @@ void bli_unpackm_int
|
||||
}
|
||||
|
||||
// Barrier so that unpacking is done before computation.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ void bli_l3_packm
|
||||
siz_t size_needed;
|
||||
|
||||
// FGVZ: Not sure why we need this barrier, but we do.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Every thread initializes x_pack and determines the size of memory
|
||||
// block needed (which gets embedded into the otherwise "blank" mem_t
|
||||
@@ -102,7 +102,7 @@ void bli_l3_packm
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the contents of the chief thread's local mem_t entry to the
|
||||
// mem_t field in this thread's control tree node.
|
||||
@@ -146,7 +146,7 @@ void bli_l3_packm
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the chief thread's local mem_t entry to the mem_t field in
|
||||
// this thread's control tree node.
|
||||
@@ -159,7 +159,7 @@ void bli_l3_packm
|
||||
// will already have the cached values in their local control
|
||||
// trees' mem_t entries, currently pointed to by cntl_mem_p.
|
||||
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,6 +182,6 @@ void bli_l3_packm
|
||||
);
|
||||
|
||||
// Barrier so that packing is done before computation.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -237,7 +237,7 @@ if ( col_stored ) { \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
@@ -248,7 +248,7 @@ bli_thread_obarrier( thread ); \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
else { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
@@ -261,7 +261,7 @@ else { \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
@@ -272,7 +272,7 @@ bli_thread_obarrier( thread ); \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
|
||||
@@ -84,7 +84,7 @@ void bli_gemm_blk_var3
|
||||
bli_thrinfo_sub_node( thread )
|
||||
);
|
||||
|
||||
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
|
||||
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
|
||||
@@ -66,7 +66,7 @@ void bli_gemm_int
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -80,7 +80,7 @@ void bli_gemm_int
|
||||
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -117,7 +117,7 @@ void bli_trsm_blk_var1
|
||||
// We must execute a barrier here because the upcoming rank-k update
|
||||
// requires the packed matrix B to be fully updated by the trsm
|
||||
// subproblem.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Isolate the remaining part of the column panel matrix A, which we do by
|
||||
// acquiring the subpartition ahead of A11 (that is, A21 or A01, depending
|
||||
|
||||
@@ -85,7 +85,7 @@ void bli_trsm_blk_var3
|
||||
);
|
||||
|
||||
//bli_thread_ibarrier( thread );
|
||||
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
|
||||
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal alpha scalars on A/B and C are non-zero, we must ensure
|
||||
|
||||
@@ -68,7 +68,7 @@ void bli_trsm_int
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ void bli_trsm_int
|
||||
}
|
||||
|
||||
// FGVZ->TMS: Is this barrier still needed?
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Create the next node in the thrinfo_t structure.
|
||||
bli_thrinfo_grow( rntm, cntl, thread );
|
||||
|
||||
@@ -340,7 +340,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
|
||||
|
||||
// Broadcast the temporary array to all threads in the parent's
|
||||
// communicator.
|
||||
new_comms = bli_thread_obroadcast( thread_par, new_comms );
|
||||
new_comms = bli_thread_broadcast( thread_par, new_comms );
|
||||
|
||||
// Chiefs in the child communicator allocate the communicator
|
||||
// object and store it in the array element corresponding to the
|
||||
@@ -348,7 +348,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
|
||||
if ( child_comm_id == 0 )
|
||||
new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
// All threads create a new thrinfo_t node using the communicator
|
||||
// that was created by their chief, as identified by parent_work_id.
|
||||
@@ -364,7 +364,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl
|
||||
NULL // sub_node
|
||||
);
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
// The parent's chief thread frees the temporary array of thrcomm_t
|
||||
// pointers.
|
||||
@@ -477,7 +477,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
|
||||
const dim_t child_comm_id = parent_comm_id % child_nt_in;
|
||||
const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
// NOTE: Recall that parent_comm_id == child_comm_id, so checking for the
|
||||
// parent's chief-ness is equivalent to checking for chief-ness in the new
|
||||
@@ -488,7 +488,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
|
||||
|
||||
// Broadcast the new thrcomm_t address to the other threads in the
|
||||
// parent's group.
|
||||
new_comm = bli_thread_obroadcast( thread_par, new_comm );
|
||||
new_comm = bli_thread_broadcast( thread_par, new_comm );
|
||||
|
||||
// All threads create a new thrinfo_t node using the communicator
|
||||
// that was created by their chief, as identified by parent_work_id.
|
||||
@@ -504,7 +504,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode
|
||||
NULL // sub_node
|
||||
);
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
return thread_chl;
|
||||
}
|
||||
|
||||
@@ -141,12 +141,12 @@ static void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t )
|
||||
|
||||
// other thrinfo_t-related functions
|
||||
|
||||
static void* bli_thread_obroadcast( thrinfo_t* t, void* p )
|
||||
static void* bli_thread_broadcast( thrinfo_t* t, void* p )
|
||||
{
|
||||
return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
|
||||
}
|
||||
|
||||
static void bli_thread_obarrier( thrinfo_t* t )
|
||||
static void bli_thread_barrier( thrinfo_t* t )
|
||||
{
|
||||
bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ void blx_l3_packm
|
||||
siz_t size_needed;
|
||||
|
||||
// FGVZ: Not sure why we need this barrier, but we do.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Every thread initializes x_pack and determines the size of memory
|
||||
// block needed (which gets embedded into the otherwise "blank" mem_t
|
||||
@@ -102,7 +102,7 @@ void blx_l3_packm
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the contents of the chief thread's local mem_t entry to the
|
||||
// mem_t field in this thread's control tree node.
|
||||
@@ -142,7 +142,7 @@ void blx_l3_packm
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_obroadcast( thread, &local_mem_s );
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the chief thread's local mem_t entry to the mem_t field in
|
||||
// this thread's control tree node.
|
||||
@@ -155,7 +155,7 @@ void blx_l3_packm
|
||||
// will already have the cached values in their local control
|
||||
// trees' mem_t entries, currently pointed to by cntl_mem_p.
|
||||
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -178,6 +178,6 @@ void blx_l3_packm
|
||||
);
|
||||
|
||||
// Barrier so that packing is done before computation.
|
||||
bli_thread_obarrier( thread );
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ void blx_gemm_blk_var3
|
||||
bli_thrinfo_sub_node( thread )
|
||||
);
|
||||
|
||||
bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );
|
||||
bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
|
||||
Reference in New Issue
Block a user