mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Initial commit to enable threading in TRSM.
Also enabled weighted partitioning for herk and trmm. Fixed a bug where multiple threads would try to modify the same state in the internal level-3 functions. Correctly computed a_next and b_next for the gemm and herk macrokernels. In trmm, a_next and b_next now point to the current micropanels.
This commit is contained in:
@@ -115,14 +115,6 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
// only for the first iteration (and then BLIS_ONE for all others).
|
||||
// And since c_pack is a local obj_t, we can simply overwrite the
|
||||
// internal beta scalar with BLIS_ONE once it has been used in the
|
||||
// first iteration.
|
||||
if ( i != 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
|
||||
// Packing must be done before computation.
|
||||
thread_ibarrier( thread );
|
||||
@@ -136,6 +128,14 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
// only for the first iteration (and then BLIS_ONE for all others).
|
||||
// And since c_pack is a local obj_t, we can simply overwrite the
|
||||
// internal beta scalar with BLIS_ONE once it has been used in the
|
||||
// first iteration.
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
@@ -79,7 +79,9 @@ void bli_gemm_int( obj_t* alpha,
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -88,7 +90,9 @@ void bli_gemm_int( obj_t* alpha,
|
||||
if ( bli_obj_is_zeros( *a ) ||
|
||||
bli_obj_is_zeros( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -106,23 +110,28 @@ void bli_gemm_int( obj_t* alpha,
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
}
|
||||
}
|
||||
|
||||
// If alpha is non-unit, typecast and apply it to the scalar attached
|
||||
// to B.
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
}
|
||||
|
||||
// If beta is non-unit, typecast and apply it to the scalar attached
|
||||
// to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
thread_obarrier( thread );
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
|
||||
@@ -249,11 +249,11 @@ void PASTEMAC(ch,varname)( \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
|
||||
@@ -84,28 +84,17 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_
|
||||
return thread;
|
||||
}
|
||||
|
||||
dim_t read_env( char* env )
|
||||
{
|
||||
dim_t number = 1;
|
||||
char* str = getenv( env );
|
||||
if( str != NULL )
|
||||
{
|
||||
number = strtol( str, NULL, 10 );
|
||||
}
|
||||
return number;
|
||||
}
|
||||
|
||||
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads )
|
||||
{
|
||||
}
|
||||
|
||||
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( )
|
||||
{
|
||||
dim_t jc_way = read_env( "BLIS_JC_NT" );
|
||||
dim_t kc_way = read_env( "BLIS_KC_NT" );
|
||||
dim_t ic_way = read_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = read_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = read_env( "BLIS_IR_NT" );
|
||||
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
|
||||
dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
|
||||
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
@@ -53,6 +53,10 @@ typedef struct gemm_thrinfo_s gemm_thrinfo_t;
|
||||
#define gemm_thread_sub_opackm( thread ) thread->opackm
|
||||
#define gemm_thread_sub_ipackm( thread ) thread->ipackm
|
||||
|
||||
// For use in gemm micro-kernel
|
||||
#define gemm_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
|
||||
#define gemm_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
|
||||
|
||||
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( );
|
||||
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** );
|
||||
|
||||
|
||||
@@ -90,7 +90,8 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
dim_t start, end;
|
||||
|
||||
// Needs to be replaced with a weighted range because triangle
|
||||
bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
|
||||
@@ -89,7 +89,9 @@ void bli_herk_int( obj_t* alpha,
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *ah ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -107,28 +109,34 @@ void bli_herk_int( obj_t* alpha,
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
}
|
||||
}
|
||||
|
||||
// If alpha is non-unit, typecast and apply it to the scalar
|
||||
// attached to A'.
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &ah_local );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_scalar_apply_scalar( alpha, &ah_local );
|
||||
}
|
||||
|
||||
// If beta is non-unit, typecast and apply it to the scalar
|
||||
// attached to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
|
||||
// Set a bool based on the uplo field of C's root object.
|
||||
if ( bli_obj_root_is_lower( c_local ) ) uplo = 0;
|
||||
else uplo = 1;
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
|
||||
@@ -286,11 +286,11 @@ void PASTEMAC(ch,varname)( \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
|
||||
@@ -90,11 +90,11 @@ void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads )
|
||||
|
||||
herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
|
||||
{
|
||||
dim_t jc_way = read_env( "BLIS_JC_NT" );
|
||||
dim_t kc_way = read_env( "BLIS_KC_NT" );
|
||||
dim_t ic_way = read_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = read_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = read_env( "BLIS_IR_NT" );
|
||||
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
|
||||
dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
|
||||
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
@@ -53,6 +53,11 @@ typedef struct herk_thrinfo_s herk_thrinfo_t;
|
||||
#define herk_thread_sub_opackm( thread ) thread->opackm
|
||||
#define herk_thread_sub_ipackm( thread ) thread->ipackm
|
||||
|
||||
// For use in herk micro-kernel
|
||||
#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
|
||||
#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
|
||||
|
||||
|
||||
herk_thrinfo_t** bli_create_herk_thrinfo_paths( );
|
||||
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths );
|
||||
|
||||
|
||||
@@ -286,11 +286,11 @@ void PASTEMAC(ch,varname)( \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
|
||||
@@ -82,7 +82,8 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
|
||||
@@ -82,7 +82,8 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
|
||||
@@ -111,7 +111,9 @@ void bli_trmm_int( obj_t* alpha,
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -129,22 +131,26 @@ void bli_trmm_int( obj_t* alpha,
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
}
|
||||
}
|
||||
|
||||
// If alpha is non-unit, typecast and apply it to the scalar attached
|
||||
// to B.
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
}
|
||||
|
||||
// If beta is non-unit, typecast and apply it to the scalar attached
|
||||
// to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
|
||||
// Set two bools: one based on the implied side parameter (the structure
|
||||
@@ -164,6 +170,8 @@ void bli_trmm_int( obj_t* alpha,
|
||||
else uplo = 1;
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
|
||||
@@ -320,11 +320,11 @@ void PASTEMAC(ch,varname)( \
|
||||
b1_i = b1 + off_a1011 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1011 * ss_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -381,11 +381,11 @@ void PASTEMAC(ch,varname)( \
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
|
||||
{ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
|
||||
@@ -325,11 +325,11 @@ void PASTEMAC(ch,varname)( \
|
||||
b1_i = b1 + off_a1112 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1112 * ss_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -385,11 +385,11 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
|
||||
@@ -329,11 +329,11 @@ void PASTEMAC(ch,varname)( \
|
||||
a1_i = a1 + off_b1121 * PACKMR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + k_b1121 * ss_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -392,11 +392,11 @@ void PASTEMAC(ch,varname)( \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
|
||||
@@ -329,11 +329,11 @@ void PASTEMAC(ch,varname)( \
|
||||
a1_i = a1 + off_b0111 * PACKMR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + k_b0111 * ss_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -392,11 +392,11 @@ void PASTEMAC(ch,varname)( \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
|
||||
@@ -90,11 +90,11 @@ void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads )
|
||||
|
||||
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( )
|
||||
{
|
||||
dim_t jc_way = read_env( "BLIS_JC_NT" );
|
||||
dim_t kc_way = read_env( "BLIS_KC_NT" );
|
||||
dim_t ic_way = read_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = read_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = read_env( "BLIS_IR_NT" );
|
||||
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
|
||||
dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
|
||||
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
@@ -37,20 +37,39 @@
|
||||
void bli_trsm_blk_var1b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b_pack;
|
||||
obj_t c1;
|
||||
obj_t b_pack_s;
|
||||
obj_t a1_pack_s;
|
||||
|
||||
obj_t a1, c1;
|
||||
obj_t* b_pack = NULL;
|
||||
obj_t* a1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offA;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b_pack );
|
||||
// Initialize object for packing B.
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_init_pack( &b_pack_s );
|
||||
bli_packm_init( b, &b_pack_s,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
b_pack = thread_obroadcast( thread, &b_pack_s );
|
||||
|
||||
// Initialize object for packing B.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
}
|
||||
a1_pack = thread_obroadcast( thread, &a1_pack_s );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( b, b_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trsm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Set the default length of and offset to the non-zero part of A.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
@@ -60,22 +79,16 @@ void bli_trsm_blk_var1b( obj_t* a,
|
||||
// A begins.
|
||||
if ( bli_obj_is_upper( *a ) )
|
||||
offA = m_trans - bli_abs( bli_obj_diag_offset_after_trans( *a ) ) -
|
||||
bli_obj_width_after_trans( *a );
|
||||
bli_obj_width_after_trans( *a );
|
||||
|
||||
// Initialize object for packing B.
|
||||
bli_packm_init( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, offA, m_trans, 8, &start, &end );
|
||||
|
||||
// Partition along the remaining portion of the m dimension.
|
||||
for ( i = offA; i < m_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( i, m_trans, a,
|
||||
b_alg = bli_determine_blocksize_b( i, end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
@@ -84,29 +97,34 @@ void bli_trsm_blk_var1b( obj_t* a,
|
||||
bli_acquire_mpart_b2t( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
//if ( bli_obj_is_zeros( a1 ) ) continue;
|
||||
|
||||
// Initialize object for packing A1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trsm subproblem.
|
||||
bli_trsm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b_pack,
|
||||
a1_pack,
|
||||
b_pack,
|
||||
&BLIS_ONE,
|
||||
&c1,
|
||||
cntl_sub_trsm( cntl ) );
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a1_pack );
|
||||
if( thread_am_ichief( thread ) )
|
||||
bli_obj_release_pack( b_pack );
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trsm_blk_var1b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,20 +37,39 @@
|
||||
void bli_trsm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b_pack;
|
||||
obj_t c1;
|
||||
obj_t b_pack_s;
|
||||
obj_t a1_pack_s;
|
||||
|
||||
obj_t a1, c1;
|
||||
obj_t* b_pack = NULL;
|
||||
obj_t* a1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offA;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b_pack );
|
||||
// Initialize object for packing B.
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_init_pack( &b_pack_s );
|
||||
bli_packm_init( b, &b_pack_s,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
b_pack = thread_obroadcast( thread, &b_pack_s );
|
||||
|
||||
// Initialize object for packing B.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
}
|
||||
a1_pack = thread_obroadcast( thread, &a1_pack_s );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( b, b_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trsm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Set the default length of and offset to the non-zero part of A.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
@@ -61,20 +80,14 @@ void bli_trsm_blk_var1f( obj_t* a,
|
||||
if ( bli_obj_is_lower( *a ) )
|
||||
offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
|
||||
|
||||
// Initialize object for packing B.
|
||||
bli_packm_init( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, offA, m_trans, 8, &start, &end );
|
||||
|
||||
// Partition along the remaining portion of the m dimension.
|
||||
for ( i = offA; i < m_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, m_trans, a,
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
@@ -84,26 +97,33 @@ void bli_trsm_blk_var1f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize object for packing A1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trsm subproblem.
|
||||
bli_trsm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b_pack,
|
||||
a1_pack,
|
||||
b_pack,
|
||||
&BLIS_ONE,
|
||||
&c1,
|
||||
cntl_sub_trsm( cntl ) );
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a1_pack );
|
||||
if( thread_am_ichief( thread ) )
|
||||
bli_obj_release_pack( b_pack );
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trsm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,40 +37,56 @@
|
||||
void bli_trsm_blk_var2b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t b1, c1;
|
||||
obj_t* a_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
// Initialize pack objects for A that are passed into packm_init().
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
dim_t start, end;
|
||||
bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = 0; i < n_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( i, n_trans, b,
|
||||
@@ -83,38 +99,55 @@ void bli_trsm_blk_var2b( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Packing must be done before computation
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Perform trsm subproblem.
|
||||
bli_trsm_int( &BLIS_ONE,
|
||||
&a_pack,
|
||||
&b1_pack,
|
||||
a_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_trsm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
// Currently must be done by 1 thread
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
//Barrier to make sure unpacking is done before next iteration's packing of C
|
||||
//Somehow, we'd like to make this a noop if packing isn't done.
|
||||
thread_ibarrier( thread );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( b1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trsm_blk_var2b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,40 +37,57 @@
|
||||
void bli_trsm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t b1, c1;
|
||||
obj_t* a_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
// Initialize pack objects for A that are passed into packm_init().
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
dim_t start, end;
|
||||
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = 0; i < n_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, n_trans, b,
|
||||
@@ -83,38 +100,55 @@ void bli_trsm_blk_var2f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing B1 and C1.
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Packing must be done before computation
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Perform trsm subproblem.
|
||||
bli_trsm_int( &BLIS_ONE,
|
||||
&a_pack,
|
||||
&b1_pack,
|
||||
a_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_trsm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
// Currently must be done by 1 thread
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
//Barrier to make sure unpacking is done before next iteration's packing of C
|
||||
//Somehow, we'd like to make this a noop if packing isn't done.
|
||||
thread_ibarrier( thread );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( b1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trsm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,38 +37,51 @@
|
||||
void bli_trsm_blk_var3b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c_pack;
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, b1_pack_s;
|
||||
|
||||
obj_t a1, b1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c_pack );
|
||||
// Initialize pack objects for C that are passed into packm_init().
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
trsm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Partition along the k dimension.
|
||||
for ( i = 0; i < k_trans; i += b_alg )
|
||||
{
|
||||
@@ -83,45 +96,60 @@ void bli_trsm_blk_var3b( obj_t* a,
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Packing must be done before computation
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Perform trsm subproblem.
|
||||
bli_trsm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b1_pack,
|
||||
a1_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c_pack,
|
||||
cntl_sub_trsm( cntl ) );
|
||||
c_pack,
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal alpha scalars on A/B and C are non-zero, we must ensure
|
||||
// that they are only used in the first iteration.
|
||||
if ( i == 0 ) { bli_obj_scalar_reset( a );
|
||||
bli_obj_scalar_reset( b );
|
||||
bli_obj_scalar_reset( &c_pack ); }
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) {
|
||||
bli_obj_scalar_reset( a );
|
||||
bli_obj_scalar_reset( b );
|
||||
bli_obj_scalar_reset( c_pack );
|
||||
}
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( &c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
bli_obj_release_pack( c_pack );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( b1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trsm_blk_var3b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,38 +37,51 @@
|
||||
void bli_trsm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c_pack;
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, b1_pack_s;
|
||||
|
||||
obj_t a1, b1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c_pack );
|
||||
// Initialize pack objects for C that are passed into packm_init().
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
trsm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Partition along the k dimension.
|
||||
for ( i = 0; i < k_trans; i += b_alg )
|
||||
{
|
||||
@@ -83,45 +96,60 @@ void bli_trsm_blk_var3f( obj_t* a,
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trsm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Packing must be done before computation
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Perform trsm subproblem.
|
||||
bli_trsm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b1_pack,
|
||||
a1_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c_pack,
|
||||
cntl_sub_trsm( cntl ) );
|
||||
c_pack,
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal alpha scalars on A/B and C are non-zero, we must ensure
|
||||
// that they are only used in the first iteration.
|
||||
if ( i == 0 ) { bli_obj_scalar_reset( a );
|
||||
bli_obj_scalar_reset( b );
|
||||
bli_obj_scalar_reset( &c_pack ); }
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) {
|
||||
bli_obj_scalar_reset( a );
|
||||
bli_obj_scalar_reset( b );
|
||||
bli_obj_scalar_reset( c_pack );
|
||||
}
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( &c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
bli_obj_release_pack( c_pack );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( b1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trsm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -125,12 +125,20 @@ void bli_trsm_front( side_t side,
|
||||
if ( bli_is_left( side ) ) cntl = l_cntl;
|
||||
else cntl = r_cntl;
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_trsm_int( alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
alpha,
|
||||
&c_local,
|
||||
cntl );
|
||||
trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_trsm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
alpha,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_trsm_thrinfo_free_paths( infos );
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
static FUNCPTR_T vars[2][2][4][3] =
|
||||
{
|
||||
@@ -88,7 +89,8 @@ void bli_trsm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
@@ -109,7 +111,9 @@ void bli_trsm_int( obj_t* alpha,
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -127,14 +131,17 @@ void bli_trsm_int( obj_t* alpha,
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
}
|
||||
}
|
||||
|
||||
// If beta is non-unit, apply it to the scalar attached to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
|
||||
// Set two bools: one based on the implied side parameter (the structure
|
||||
@@ -150,7 +157,8 @@ void bli_trsm_int( obj_t* alpha,
|
||||
// attached to B (the non-triangular matrix).
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
}
|
||||
}
|
||||
else // if ( bli_obj_root_is_triangular( *b ) )
|
||||
@@ -164,10 +172,13 @@ void bli_trsm_int( obj_t* alpha,
|
||||
// attached to A (the non-triangular matrix).
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &a_local );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_scalar_apply_scalar( alpha, &a_local );
|
||||
}
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
@@ -179,6 +190,7 @@ void bli_trsm_int( obj_t* alpha,
|
||||
f( &a_local,
|
||||
&b_local,
|
||||
&c_local,
|
||||
cntl );
|
||||
cntl,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -37,4 +37,5 @@ void bli_trsm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
@@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemmtrsm_ukr,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trsm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
|
||||
@@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
|
||||
void bli_trsm_ll_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -139,7 +141,8 @@ void bli_trsm_ll_ker_var2( obj_t* a,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemmtrsm_ukr,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemmtrsm_ukr, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trsm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernels' addresses to their function pointer types. */ \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trsm_ll_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemmtrsm_ukr, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trsm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_ll_ker_var2 )
|
||||
|
||||
@@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemmtrsm_ukr,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trsm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
|
||||
@@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
|
||||
void bli_trsm_lu_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -139,7 +141,8 @@ void bli_trsm_lu_ker_var2( obj_t* a,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemmtrsm_ukr,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemmtrsm_ukr, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trsm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernels' addresses to their function pointer types. */ \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trsm_lu_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemmtrsm_ukr, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trsm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_lu_ker_var2 )
|
||||
|
||||
@@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemmtrsm_ukr,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trsm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
|
||||
@@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
|
||||
void bli_trsm_rl_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -139,7 +141,8 @@ void bli_trsm_rl_ker_var2( obj_t* a,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemmtrsm_ukr,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemmtrsm_ukr, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trsm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernels' addresses to their function pointer types. */ \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trsm_rl_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemmtrsm_ukr, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trsm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_rl_ker_var2 )
|
||||
|
||||
@@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemmtrsm_ukr,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trsm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
|
||||
@@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
|
||||
void bli_trsm_ru_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl )
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -139,7 +141,8 @@ void bli_trsm_ru_ker_var2( obj_t* a,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemmtrsm_ukr,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemmtrsm_ukr, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trsm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernels' addresses to their function pointer types. */ \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trsm_ru_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trsm_t* cntl );
|
||||
trsm_t* cntl,
|
||||
trsm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemmtrsm_ukr, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trsm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_ru_ker_var2 )
|
||||
|
||||
173
frame/3/trsm/bli_trsm_threading.c
Normal file
173
frame/3/trsm/bli_trsm_threading.c
Normal file
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trsm_thrinfo_t* sub_trsm )
|
||||
{
|
||||
thread->ocomm = ocomm;
|
||||
thread->ocomm_id = ocomm_id;
|
||||
thread->icomm = icomm;
|
||||
thread->icomm_id = icomm_id;
|
||||
thread->n_way = n_way;
|
||||
thread->work_id = work_id;
|
||||
thread->opackm = opackm;
|
||||
thread->ipackm = ipackm;
|
||||
thread->sub_trsm = sub_trsm;
|
||||
}
|
||||
|
||||
// Configures `thread` as the thread info for a run with no parallelism:
// both communicators are BLIS_SINGLE_COMM with id 0, the loop is split one
// way with work id 0, both packm infos are BLIS_PACKM_SINGLE_THREADED, and
// the node is its own sub-node, so descending a level yields the same info.
void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread )
{
	bli_setup_trsm_thrinfo_node( thread,
	                             &BLIS_SINGLE_COMM, 0,
	                             &BLIS_SINGLE_COMM, 0,
	                             1, 0,
	                             &BLIS_PACKM_SINGLE_THREADED,
	                             &BLIS_PACKM_SINGLE_THREADED,
	                             thread );
}
|
||||
|
||||
trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trsm_thrinfo_t* sub_trsm )
|
||||
{
|
||||
trsm_thrinfo_t* thread = ( trsm_thrinfo_t* ) bli_malloc( sizeof( trsm_thrinfo_t ) );
|
||||
bli_setup_trsm_thrinfo_node( thread, ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way, work_id,
|
||||
opackm,
|
||||
ipackm,
|
||||
sub_trsm );
|
||||
return thread;
|
||||
}
|
||||
|
||||
// Releases the array of per-thread paths returned by
// bli_create_trsm_thrinfo_paths().
//
// Only the array itself is released here; it is obtained from malloc() in
// bli_create_trsm_thrinfo_paths(), so free() is the matching call. Passing
// NULL is harmless.
//
// TODO: the thrinfo nodes, packm thread infos and communicators reachable
// from each path are still leaked. Freeing them needs the matching release
// routines for bli_malloc(), bli_create_packm_thread_info() and
// bli_create_communicator(), and care not to free a communicator shared by
// several threads more than once.
void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads )
{
	free( threads );
}
|
||||
|
||||
trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( )
|
||||
{
|
||||
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
|
||||
dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
|
||||
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
|
||||
dim_t kc_nt = ic_way * jr_way * ir_way;
|
||||
dim_t ic_nt = jr_way * ir_way;
|
||||
dim_t jr_nt = ir_way;
|
||||
dim_t ir_nt = 1;
|
||||
|
||||
|
||||
trsm_thrinfo_t** paths = (trsm_thrinfo_t**) malloc( global_num_threads * sizeof( trsm_thrinfo_t* ) );
|
||||
|
||||
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
|
||||
for( int a = 0; a < jc_way; a++ )
|
||||
{
|
||||
thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
|
||||
for( int b = 0; b < kc_way; b++ )
|
||||
{
|
||||
thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
|
||||
for( int c = 0; c < ic_way; c++ )
|
||||
{
|
||||
thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
|
||||
for( int d = 0; d < jr_way; d++ )
|
||||
{
|
||||
thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
|
||||
for( int e = 0; e < ir_way; e++)
|
||||
{
|
||||
thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
|
||||
dim_t ir_comm_id = 0;
|
||||
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
|
||||
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
|
||||
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
trsm_thrinfo_t* jr_info = bli_create_trsm_thrinfo_node( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info);
|
||||
|
||||
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
packb, packa, jr_info);
|
||||
|
||||
trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
NULL, NULL, ic_info);
|
||||
|
||||
trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
NULL, NULL, kc_info);
|
||||
paths[global_comm_id] = jc_info;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return paths;
|
||||
}
|
||||
79
frame/3/trsm/bli_trsm_threading.h
Normal file
79
frame/3/trsm/bli_trsm_threading.h
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
struct trsm_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within that thread comm
|
||||
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t icomm_id; //Our thread id within that thread comm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //What we're working on
|
||||
|
||||
packm_thrinfo_t* opackm;
|
||||
packm_thrinfo_t* ipackm;
|
||||
struct trsm_thrinfo_s* sub_trsm;
|
||||
};
|
||||
typedef struct trsm_thrinfo_s trsm_thrinfo_t;
|
||||
|
||||
// Accessors for the child descriptors stored in a trsm_thrinfo_t.
// `thread` is a pointer expression. It is parenthesized inside the expansion
// so that arguments such as `paths + i` or `&info` bind to `->` correctly.
#define trsm_thread_sub_trsm( thread )   ( ( thread )->sub_trsm )
#define trsm_thread_sub_opackm( thread ) ( ( thread )->opackm )
#define trsm_thread_sub_ipackm( thread ) ( ( thread )->ipackm )

// Ownership tests: evaluate to non-zero when iteration `index` belongs to
// `thread`, i.e. when index and thread->work_id are congruent modulo
// thread->n_way. All four variants currently share the same rule.
// Both arguments are parenthesized so compound expressions (e.g. `i + 1`)
// are reduced as a whole. `thread` is evaluated more than once, so it must
// not have side effects.
#define trsm_r_ir_my_iter( index, thread ) ( ( index ) % ( thread )->n_way == ( thread )->work_id % ( thread )->n_way )
#define trsm_r_jr_my_iter( index, thread ) ( ( index ) % ( thread )->n_way == ( thread )->work_id % ( thread )->n_way )
#define trsm_l_ir_my_iter( index, thread ) ( ( index ) % ( thread )->n_way == ( thread )->work_id % ( thread )->n_way )
#define trsm_l_jr_my_iter( index, thread ) ( ( index ) % ( thread )->n_way == ( thread )->work_id % ( thread )->n_way )
|
||||
|
||||
trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( );
|
||||
void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** );
|
||||
|
||||
void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trsm_thrinfo_t* sub_trsm );
|
||||
|
||||
trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trsm_thrinfo_t* sub_trsm );
|
||||
|
||||
void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread );
|
||||
@@ -230,8 +230,51 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
|
||||
*end = bli_min( *start + n_pt, size + all_start );
|
||||
}
|
||||
|
||||
void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
|
||||
// Width of the next caucus' slice when `offset` elements (measured from the
// narrow end of the weighting) have already been handed out. Solves
// (offset + w)^2 - offset^2 = area for w, rounds w UP to an integer and then
// up to a multiple of block_factor. Rounding up (instead of truncating)
// guarantees that the widths of all caucuses together cover the whole range.
static dim_t bli_weighted_range_width( dim_t offset, double area, dim_t block_factor )
{
    double off   = ( double )offset;
    double exact = sqrt( off * off + area ) - off;
    dim_t  width = ( dim_t )exact;
    dim_t  rem;

    if ( ( double )width < exact ) width += 1;

    rem = width % block_factor;
    if ( rem != 0 ) width += block_factor - rem;

    return width;
}

// Computes the sub-range [*start, *end) of [all_start, all_end) assigned to
// the calling thread when the range is split among thread->n_way caucuses
// with square-root weighting: slice k ends where the cumulative squared
// length reaches k * size^2 / n_way, so slices get narrower as the offset
// grows.
//
//   thr          - pointer to a thrinfo_t (or a struct laid out like one);
//                  only n_way and work_id are read.
//   all_start    - first index of the full range.
//   all_end      - one past the last index of the full range.
//   block_factor - every slice width is rounded up to a multiple of this.
//   forward      - non-zero: the widest slice sits at the END of the range
//                  and belongs to work_id n_way-1; zero: the widest slice
//                  sits at the START and belongs to work_id 0.
//   start, end   - outputs; always satisfy
//                  all_start <= *start <= *end <= all_end.
//
// All arithmetic is done on offsets relative to all_start, so a non-zero
// all_start neither skews the weighting nor lets a slice escape the range.
// The first/last caucus explicitly absorbs whatever remains, and slices that
// would fall outside the range collapse to an empty one.
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
{
    thrinfo_t* thread  = (thrinfo_t*) thr;
    dim_t      n_way   = thread->n_way;
    dim_t      work_id = thread->work_id;
    dim_t      size;
    dim_t      lo;
    dim_t      hi;
    dim_t      width;
    double     area;

    *start = all_start;
    *end   = all_end;

    // Nothing to split: empty range, or a single (or invalid) caucus count.
    if ( all_end <= all_start || n_way <= 1 ) return;

    // A non-positive block factor would make the modulo below undefined.
    if ( block_factor < 1 ) block_factor = 1;

    size = all_end - all_start;
    lo   = 0;
    hi   = size;

    // Each caucus receives this much "squared length" (twice the triangle
    // area). Kept in floating point to avoid integer truncation/overflow.
    area = ( double )size * ( double )size / ( double )n_way;

    if( forward ) {
        // Hand out slices from the end of the range, starting with the
        // highest work_id, until we reach our own.
        dim_t len = 0;
        dim_t caucus;

        for ( caucus = n_way - 1; caucus > work_id; --caucus ) {
            width = bli_weighted_range_width( len, area, block_factor );
            hi    = ( hi > width ) ? hi - width : 0;
            len  += width;
        }

        if ( work_id > 0 ) {
            width = bli_weighted_range_width( len, area, block_factor );
            lo    = ( hi > width ) ? hi - width : 0;
        }
        // else: work_id 0 takes everything that is left, i.e. lo stays 0.
    }
    else{
        // Hand out slices from the start of the range, starting with
        // work_id 0, until we reach our own.
        dim_t i;

        for ( i = 0; i < work_id; ++i ) {
            lo += bli_weighted_range_width( lo, area, block_factor );
        }

        if ( work_id + 1 < n_way ) {
            hi = lo + bli_weighted_range_width( lo, area, block_factor );
        }
        // else: the last caucus takes everything that is left (hi == size).

        if ( lo > size ) lo = size;
        if ( hi > size ) hi = size;
    }

    *start = all_start + lo;
    *end   = all_start + hi;
}
|
||||
|
||||
void bli_level3_thread_decorator( dim_t n_threads,
|
||||
@@ -257,3 +300,14 @@ void bli_level3_thread_decorator( dim_t n_threads,
|
||||
thread[omp_id] );
|
||||
}
|
||||
}
|
||||
|
||||
dim_t bli_read_nway_from_env( char* env )
|
||||
{
|
||||
dim_t number = 1;
|
||||
char* str = getenv( env );
|
||||
if( str != NULL )
|
||||
{
|
||||
number = strtol( str, NULL, 10 );
|
||||
}
|
||||
return number;
|
||||
}
|
||||
|
||||
@@ -88,11 +88,13 @@ typedef struct thrinfo_s thrinfo_t;
|
||||
// Barrier on the thread's icomm communicator. `thread` is parenthesized so
// that pointer expressions such as `paths[i]` or `info + 1` expand correctly;
// it is evaluated twice, so it must not have side effects.
#define thread_ibarrier( thread ) bli_barrier( ( thread )->icomm, ( thread )->icomm_id )
|
||||
|
||||
void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
|
||||
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end);
|
||||
|
||||
thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id );
|
||||
void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id );
|
||||
dim_t bli_read_nway_from_env( char* env );
|
||||
//void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm );
|
||||
//thrinfo_t* bli_create_thread_info( dim_t* n_threads_each_level, dim_t n_levels );
|
||||
|
||||
@@ -100,6 +102,7 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm
|
||||
#include "bli_gemm_threading.h"
|
||||
#include "bli_herk_threading.h"
|
||||
#include "bli_trmm_threading.h"
|
||||
#include "bli_trsm_threading.h"
|
||||
|
||||
// Pointer to a void function taking (alpha, a, b, beta, c) as obj_t* plus an
// untyped control tree (cntl) and an untyped thread descriptor (thread).
// NOTE(review): presumably the common shape of the level-3 internal
// back-ends (e.g. bli_gemm_int) -- confirm each back-end matches.
typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread );
|
||||
void bli_level3_thread_decorator( dim_t num_threads,
|
||||
|
||||
Reference in New Issue
Block a user