mirror of
https://github.com/amd/blis.git
synced 2026-05-13 10:35:38 +00:00
Avoid gemmsup barriers when not packing A or B. (#622)
Details:
- Implemented a multithreaded optimization for the special (and common) case of employing the gemmsup code path when the user requests (implicitly or explicitly) that neither A nor B be packed during computation. This optimization takes the form of a greatly reduced code branch in bli_thrinfo_sup_create_for_cntl(), which avoids a broadcast and two barriers, and results in higher performance when obtaining two-way or higher parallelism within BLIS. Thanks to Bhaskar Nallani of AMD for proposing this change via issue #605.
- Added an early return branch to bli_thrinfo_create_for_cntl() that detects and quickly handles cases where no parallelism is being obtained within BLIS (i.e., single-threaded execution). Note that this special case handling was/is already present in bli_thrinfo_sup_create_for_cntl().
- CREDITS file update.
This commit is contained in:
1
CREDITS
1
CREDITS
@@ -64,6 +64,7 @@ but many others have contributed code and feedback, including
|
|||||||
Simon Lukas Märtens @ACSimon33 (RWTH Aachen University)
|
Simon Lukas Märtens @ACSimon33 (RWTH Aachen University)
|
||||||
Devin Matthews @devinamatthews (The University of Texas at Austin)
|
Devin Matthews @devinamatthews (The University of Texas at Austin)
|
||||||
Stefanos Mavros @smavros
|
Stefanos Mavros @smavros
|
||||||
|
Mithun Mohan @MithunMohanKadavil (AMD)
|
||||||
Ilknur Mustafazade @Runkli
|
Ilknur Mustafazade @Runkli
|
||||||
@nagsingh
|
@nagsingh
|
||||||
Bhaskar Nallani @BhaskarNallani (AMD)
|
Bhaskar Nallani @BhaskarNallani (AMD)
|
||||||
|
|||||||
@@ -298,6 +298,24 @@ thrinfo_t* bli_thrinfo_create_for_cntl
|
|||||||
thrinfo_t* thread_par
|
thrinfo_t* thread_par
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
// If we are running with a single thread, all of the code can be reduced
|
||||||
|
// and simplified to this.
|
||||||
|
if ( bli_rntm_calc_num_threads( rntm ) == 1 )
|
||||||
|
{
|
||||||
|
thrinfo_t* thread_chl = bli_thrinfo_create
|
||||||
|
(
|
||||||
|
rntm, // rntm
|
||||||
|
&BLIS_SINGLE_COMM, // ocomm
|
||||||
|
0, // ocomm_id
|
||||||
|
1, // n_way
|
||||||
|
0, // work_id
|
||||||
|
FALSE, // free_comm
|
||||||
|
BLIS_NO_PART, // bszid
|
||||||
|
NULL // sub_node
|
||||||
|
);
|
||||||
|
return thread_chl;
|
||||||
|
}
|
||||||
|
|
||||||
thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ];
|
thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ];
|
||||||
thrcomm_t** new_comms = NULL;
|
thrcomm_t** new_comms = NULL;
|
||||||
|
|
||||||
|
|||||||
@@ -145,7 +145,6 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
|
|||||||
thrinfo_t* thread_par
|
thrinfo_t* thread_par
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
#if 1
|
|
||||||
// If we are running with a single thread, all of the code can be reduced
|
// If we are running with a single thread, all of the code can be reduced
|
||||||
// and simplified to this.
|
// and simplified to this.
|
||||||
if ( bli_rntm_calc_num_threads( rntm ) == 1 )
|
if ( bli_rntm_calc_num_threads( rntm ) == 1 )
|
||||||
@@ -163,84 +162,128 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
|
|||||||
);
|
);
|
||||||
return thread_chl;
|
return thread_chl;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ];
|
// The remainder of this function handles the cases involving the use of
|
||||||
thrcomm_t** new_comms = NULL;
|
// multiple BLIS threads.
|
||||||
|
|
||||||
const dim_t parent_nt_in = bli_thread_num_threads( thread_par );
|
if ( bli_rntm_pack_a( rntm ) == FALSE &&
|
||||||
const dim_t parent_n_way = bli_thread_n_way( thread_par );
|
bli_rntm_pack_b( rntm ) == FALSE )
|
||||||
const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
|
|
||||||
const dim_t parent_work_id = bli_thread_work_id( thread_par );
|
|
||||||
|
|
||||||
// Sanity check: make sure the number of threads in the parent's
|
|
||||||
// communicator is divisible by the number of new sub-groups.
|
|
||||||
if ( parent_nt_in % parent_n_way != 0 )
|
|
||||||
{
|
{
|
||||||
printf( "Assertion failed: parent_nt_in <mod> parent_n_way != 0\n" );
|
// If we are packing neither A nor B, there are no broadcasts or barriers
|
||||||
bli_abort();
|
// needed to synchronize threads (since all threads can work completely
|
||||||
}
|
// independently). In this special case situation, the thrinfo_t can be
|
||||||
|
// created with much simpler logic.
|
||||||
|
|
||||||
// Compute:
|
const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
|
||||||
// - the number of threads inside the new child comm,
|
|
||||||
// - the current thread's id within the new communicator,
|
// Compute:
|
||||||
// - the current thread's work id, given the ways of parallelism
|
// - the number of threads inside the new child comm,
|
||||||
// to be obtained within the next loop.
|
// - the current thread's id within the new communicator,
|
||||||
const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm );
|
// - the current thread's work id, given the ways of parallelism
|
||||||
const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm );
|
// to be obtained within the next loop.
|
||||||
const dim_t child_comm_id = parent_comm_id % child_nt_in;
|
const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm );
|
||||||
const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
|
const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm );
|
||||||
|
const dim_t child_comm_id = parent_comm_id % child_nt_in;
|
||||||
|
const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
|
||||||
|
|
||||||
|
// All threads create a new thrinfo_t node using the communicator
|
||||||
|
// that was created by their chief, as identified by parent_work_id.
|
||||||
|
thrinfo_t* thread_chl = bli_thrinfo_create
|
||||||
|
(
|
||||||
|
rntm, // rntm
|
||||||
|
NULL, // ocomm
|
||||||
|
child_comm_id, // ocomm_id
|
||||||
|
child_n_way, // n_way
|
||||||
|
child_work_id, // work_id
|
||||||
|
TRUE, // free_comm
|
||||||
|
*bszid_chl, // bszid
|
||||||
|
NULL // sub_node
|
||||||
|
);
|
||||||
|
|
||||||
|
return thread_chl;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// If we are packing at least one of A or B, then we use the general
|
||||||
|
// approach that employs broadcasts and barriers.
|
||||||
|
|
||||||
|
thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ];
|
||||||
|
thrcomm_t** new_comms = NULL;
|
||||||
|
|
||||||
|
const dim_t parent_nt_in = bli_thread_num_threads( thread_par );
|
||||||
|
const dim_t parent_n_way = bli_thread_n_way( thread_par );
|
||||||
|
const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par );
|
||||||
|
const dim_t parent_work_id = bli_thread_work_id( thread_par );
|
||||||
|
|
||||||
|
// Sanity check: make sure the number of threads in the parent's
|
||||||
|
// communicator is divisible by the number of new sub-groups.
|
||||||
|
if ( parent_nt_in % parent_n_way != 0 )
|
||||||
|
{
|
||||||
|
printf( "Assertion failed: parent_nt_in <mod> parent_n_way != 0\n" );
|
||||||
|
bli_abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute:
|
||||||
|
// - the number of threads inside the new child comm,
|
||||||
|
// - the current thread's id within the new communicator,
|
||||||
|
// - the current thread's work id, given the ways of parallelism
|
||||||
|
// to be obtained within the next loop.
|
||||||
|
const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm );
|
||||||
|
const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm );
|
||||||
|
const dim_t child_comm_id = parent_comm_id % child_nt_in;
|
||||||
|
const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way );
|
||||||
|
|
||||||
//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl );
|
//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl );
|
||||||
|
|
||||||
// The parent's chief thread creates a temporary array of thrcomm_t
|
// The parent's chief thread creates a temporary array of thrcomm_t
|
||||||
// pointers.
|
// pointers.
|
||||||
if ( bli_thread_am_ochief( thread_par ) )
|
if ( bli_thread_am_ochief( thread_par ) )
|
||||||
{
|
{
|
||||||
err_t r_val;
|
err_t r_val;
|
||||||
|
|
||||||
if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
|
if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
|
||||||
new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val );
|
new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val );
|
||||||
else
|
else
|
||||||
new_comms = static_comms;
|
new_comms = static_comms;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Broadcast the temporary array to all threads in the parent's
|
||||||
|
// communicator.
|
||||||
|
new_comms = bli_thread_broadcast( thread_par, new_comms );
|
||||||
|
|
||||||
|
// Chiefs in the child communicator allocate the communicator
|
||||||
|
// object and store it in the array element corresponding to the
|
||||||
|
// parent's work id.
|
||||||
|
if ( child_comm_id == 0 )
|
||||||
|
new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
|
||||||
|
|
||||||
|
bli_thread_barrier( thread_par );
|
||||||
|
|
||||||
|
// All threads create a new thrinfo_t node using the communicator
|
||||||
|
// that was created by their chief, as identified by parent_work_id.
|
||||||
|
thrinfo_t* thread_chl = bli_thrinfo_create
|
||||||
|
(
|
||||||
|
rntm, // rntm
|
||||||
|
new_comms[ parent_work_id ], // ocomm
|
||||||
|
child_comm_id, // ocomm_id
|
||||||
|
child_n_way, // n_way
|
||||||
|
child_work_id, // work_id
|
||||||
|
TRUE, // free_comm
|
||||||
|
*bszid_chl, // bszid
|
||||||
|
NULL // sub_node
|
||||||
|
);
|
||||||
|
|
||||||
|
bli_thread_barrier( thread_par );
|
||||||
|
|
||||||
|
// The parent's chief thread frees the temporary array of thrcomm_t
|
||||||
|
// pointers.
|
||||||
|
if ( bli_thread_am_ochief( thread_par ) )
|
||||||
|
{
|
||||||
|
if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
|
||||||
|
bli_free_intl( new_comms );
|
||||||
|
}
|
||||||
|
|
||||||
|
return thread_chl;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Broadcast the temporary array to all threads in the parent's
|
|
||||||
// communicator.
|
|
||||||
new_comms = bli_thread_broadcast( thread_par, new_comms );
|
|
||||||
|
|
||||||
// Chiefs in the child communicator allocate the communicator
|
|
||||||
// object and store it in the array element corresponding to the
|
|
||||||
// parent's work id.
|
|
||||||
if ( child_comm_id == 0 )
|
|
||||||
new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
|
|
||||||
|
|
||||||
bli_thread_barrier( thread_par );
|
|
||||||
|
|
||||||
// All threads create a new thrinfo_t node using the communicator
|
|
||||||
// that was created by their chief, as identified by parent_work_id.
|
|
||||||
thrinfo_t* thread_chl = bli_thrinfo_create
|
|
||||||
(
|
|
||||||
rntm, // rntm
|
|
||||||
new_comms[ parent_work_id ], // ocomm
|
|
||||||
child_comm_id, // ocomm_id
|
|
||||||
child_n_way, // n_way
|
|
||||||
child_work_id, // work_id
|
|
||||||
TRUE, // free_comm
|
|
||||||
*bszid_chl, // bszid
|
|
||||||
NULL // sub_node
|
|
||||||
);
|
|
||||||
|
|
||||||
bli_thread_barrier( thread_par );
|
|
||||||
|
|
||||||
// The parent's chief thread frees the temporary array of thrcomm_t
|
|
||||||
// pointers.
|
|
||||||
if ( bli_thread_am_ochief( thread_par ) )
|
|
||||||
{
|
|
||||||
if ( parent_n_way > BLIS_NUM_STATIC_COMMS )
|
|
||||||
bli_free_intl( new_comms );
|
|
||||||
}
|
|
||||||
|
|
||||||
return thread_chl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user