Initial commit to enable threading in TRSM.

Also enabled weighted partitioning for herk and trmm.
Fixed a bug where multiple threads would try to modify the same state in the internal level-3 functions.
Correctly compute a_next and b_next for the gemm and herk macrokernels.
In trmm, a_next and b_next now point to the current micropanels.
This commit is contained in:
Tyler Smith
2014-03-24 15:21:42 -05:00
parent 23d9eab354
commit f0824a04fc
46 changed files with 889 additions and 342 deletions

View File

@@ -115,14 +115,6 @@ void bli_gemm_blk_var3f( obj_t* a,
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
gemm_thread_sub_ipackm( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it
// only for the first iteration (and then BLIS_ONE for all others).
// And since c_pack is a local obj_t, we can simply overwrite the
// internal beta scalar with BLIS_ONE once it has been used in the
// first iteration.
if ( i != 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
// Packing must be done before computation.
thread_ibarrier( thread );
@@ -136,6 +128,14 @@ void bli_gemm_blk_var3f( obj_t* a,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it
// only for the first iteration (and then BLIS_ONE for all others).
// And since c_pack is a local obj_t, we can simply overwrite the
// internal beta scalar with BLIS_ONE once it has been used in the
// first iteration.
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
}
thread_obarrier( thread );

View File

@@ -79,7 +79,9 @@ void bli_gemm_int( obj_t* alpha,
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *b ) )
{
bli_scalm( beta, c );
if( thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
return;
}
@@ -88,7 +90,9 @@ void bli_gemm_int( obj_t* alpha,
if ( bli_obj_is_zeros( *a ) ||
bli_obj_is_zeros( *b ) )
{
bli_scalm( beta, c );
if( thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
return;
}
@@ -106,23 +110,28 @@ void bli_gemm_int( obj_t* alpha,
// packed, this is our last chance to handle the transposition.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
{
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
if( thread_am_ochief( thread ) ) {
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
}
}
// If alpha is non-unit, typecast and apply it to the scalar attached
// to B.
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &b_local );
if( thread_am_ochief( thread ) )
bli_obj_scalar_apply_scalar( alpha, &b_local );
}
// If beta is non-unit, typecast and apply it to the scalar attached
// to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
if( thread_am_ochief( thread ) )
bli_obj_scalar_apply_scalar( beta, &c_local );
}
thread_obarrier( thread );
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );

View File

@@ -249,11 +249,11 @@ void PASTEMAC(ch,varname)( \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \

View File

@@ -84,28 +84,17 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_
return thread;
}
dim_t read_env( char* env )
{
dim_t number = 1;
char* str = getenv( env );
if( str != NULL )
{
number = strtol( str, NULL, 10 );
}
return number;
}
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads )
{
}
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( )
{
dim_t jc_way = read_env( "BLIS_JC_NT" );
dim_t kc_way = read_env( "BLIS_KC_NT" );
dim_t ic_way = read_env( "BLIS_IC_NT" );
dim_t jr_way = read_env( "BLIS_JR_NT" );
dim_t ir_way = read_env( "BLIS_IR_NT" );
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
assert( global_num_threads != 0 );

View File

@@ -53,6 +53,10 @@ typedef struct gemm_thrinfo_s gemm_thrinfo_t;
#define gemm_thread_sub_opackm( thread ) thread->opackm
#define gemm_thread_sub_ipackm( thread ) thread->ipackm
// For use in gemm micro-kernel
#define gemm_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
#define gemm_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( );
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** );

View File

@@ -90,7 +90,8 @@ void bli_herk_blk_var2f( obj_t* a,
dim_t start, end;
// Use a weighted range because the region being partitioned is triangular.
bli_get_range( thread, 0, n_trans, 8, &start, &end );
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -89,7 +89,9 @@ void bli_herk_int( obj_t* alpha,
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *ah ) )
{
bli_scalm( beta, c );
if( thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
return;
}
@@ -107,28 +109,34 @@ void bli_herk_int( obj_t* alpha,
// packed, this is our last chance to handle the transposition.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
{
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
if( thread_am_ochief( thread ) ) {
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
}
}
// If alpha is non-unit, typecast and apply it to the scalar
// attached to A'.
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &ah_local );
if( thread_am_ochief( thread ) )
bli_obj_scalar_apply_scalar( alpha, &ah_local );
}
// If beta is non-unit, typecast and apply it to the scalar
// attached to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
if( thread_am_ochief( thread ) )
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Set a bool based on the uplo field of C's root object.
if ( bli_obj_root_is_lower( c_local ) ) uplo = 0;
else uplo = 1;
thread_obarrier( thread );
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );

View File

@@ -286,11 +286,11 @@ void PASTEMAC(ch,varname)( \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \

View File

@@ -90,11 +90,11 @@ void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads )
herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
{
dim_t jc_way = read_env( "BLIS_JC_NT" );
dim_t kc_way = read_env( "BLIS_KC_NT" );
dim_t ic_way = read_env( "BLIS_IC_NT" );
dim_t jr_way = read_env( "BLIS_JR_NT" );
dim_t ir_way = read_env( "BLIS_IR_NT" );
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
assert( global_num_threads != 0 );

View File

@@ -53,6 +53,11 @@ typedef struct herk_thrinfo_s herk_thrinfo_t;
#define herk_thread_sub_opackm( thread ) thread->opackm
#define herk_thread_sub_ipackm( thread ) thread->ipackm
// For use in herk micro-kernel
#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
herk_thrinfo_t** bli_create_herk_thrinfo_paths( );
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths );

View File

@@ -286,11 +286,11 @@ void PASTEMAC(ch,varname)( \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \

View File

@@ -82,7 +82,8 @@ void bli_trmm_blk_var2b( obj_t* a,
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
bli_get_range( thread, 0, n_trans, 8, &start, &end );
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -82,7 +82,8 @@ void bli_trmm_blk_var2f( obj_t* a,
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
bli_get_range( thread, 0, n_trans, 8, &start, &end );
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -111,7 +111,9 @@ void bli_trmm_int( obj_t* alpha,
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *b ) )
{
bli_scalm( beta, c );
if( thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
return;
}
@@ -129,22 +131,26 @@ void bli_trmm_int( obj_t* alpha,
// packed, this is our last chance to handle the transposition.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
{
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
if( thread_am_ochief( thread ) ) {
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
}
}
// If alpha is non-unit, typecast and apply it to the scalar attached
// to B.
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &b_local );
if( thread_am_ochief( thread ) )
bli_obj_scalar_apply_scalar( alpha, &b_local );
}
// If beta is non-unit, typecast and apply it to the scalar attached
// to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
if( thread_am_ochief( thread ) )
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Set two bools: one based on the implied side parameter (the structure
@@ -164,6 +170,8 @@ void bli_trmm_int( obj_t* alpha,
else uplo = 1;
}
thread_obarrier( thread );
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );

View File

@@ -320,11 +320,11 @@ void PASTEMAC(ch,varname)( \
b1_i = b1 + off_a1011 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1011 * ss_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -381,11 +381,11 @@ void PASTEMAC(ch,varname)( \
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
{ \
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \

View File

@@ -325,11 +325,11 @@ void PASTEMAC(ch,varname)( \
b1_i = b1 + off_a1112 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1112 * ss_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -385,11 +385,11 @@ void PASTEMAC(ch,varname)( \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \

View File

@@ -329,11 +329,11 @@ void PASTEMAC(ch,varname)( \
a1_i = a1 + off_b1121 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + k_b1121 * ss_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -392,11 +392,11 @@ void PASTEMAC(ch,varname)( \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \

View File

@@ -329,11 +329,11 @@ void PASTEMAC(ch,varname)( \
a1_i = a1 + off_b0111 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + k_b0111 * ss_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -392,11 +392,11 @@ void PASTEMAC(ch,varname)( \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \

View File

@@ -90,11 +90,11 @@ void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads )
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( )
{
dim_t jc_way = read_env( "BLIS_JC_NT" );
dim_t kc_way = read_env( "BLIS_KC_NT" );
dim_t ic_way = read_env( "BLIS_IC_NT" );
dim_t jr_way = read_env( "BLIS_JR_NT" );
dim_t ir_way = read_env( "BLIS_IR_NT" );
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
assert( global_num_threads != 0 );

View File

@@ -37,20 +37,39 @@
void bli_trsm_blk_var1b( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b_pack;
obj_t c1;
obj_t b_pack_s;
obj_t a1_pack_s;
obj_t a1, c1;
obj_t* b_pack = NULL;
obj_t* a1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t m_trans;
dim_t offA;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b_pack );
// Initialize object for packing B.
if( thread_am_ochief( thread ) ) {
bli_obj_init_pack( &b_pack_s );
bli_packm_init( b, &b_pack_s,
cntl_sub_packm_b( cntl ) );
}
b_pack = thread_obroadcast( thread, &b_pack_s );
// Initialize object for packing A1.
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
}
a1_pack = thread_obroadcast( thread, &a1_pack_s );
// Pack B1 (if instructed).
bli_packm_int( b, b_pack,
cntl_sub_packm_b( cntl ),
trsm_thread_sub_opackm( thread ) );
// Set the default length of and offset to the non-zero part of A.
m_trans = bli_obj_length_after_trans( *a );
@@ -60,22 +79,16 @@ void bli_trsm_blk_var1b( obj_t* a,
// A begins.
if ( bli_obj_is_upper( *a ) )
offA = m_trans - bli_abs( bli_obj_diag_offset_after_trans( *a ) ) -
bli_obj_width_after_trans( *a );
bli_obj_width_after_trans( *a );
// Initialize object for packing B.
bli_packm_init( b, &b_pack,
cntl_sub_packm_b( cntl ) );
// Pack B1 (if instructed).
bli_packm_int( b, &b_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
dim_t start, end;
bli_get_range( thread, offA, m_trans, 8, &start, &end );
// Partition along the remaining portion of the m dimension.
for ( i = offA; i < m_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, m_trans, a,
b_alg = bli_determine_blocksize_b( i, end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.
@@ -84,29 +97,34 @@ void bli_trsm_blk_var1b( obj_t* a,
bli_acquire_mpart_b2t( BLIS_SUBPART1,
i, b_alg, c, &c1 );
//if ( bli_obj_is_zeros( a1 ) ) continue;
// Initialize object for packing A1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Perform trsm subproblem.
bli_trsm_int( &BLIS_ONE,
&a1_pack,
&b_pack,
a1_pack,
b_pack,
&BLIS_ONE,
&c1,
cntl_sub_trsm( cntl ) );
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a1_pack );
if( thread_am_ichief( thread ) )
bli_obj_release_pack( b_pack );
}

View File

@@ -35,5 +35,6 @@
void bli_trsm_blk_var1b( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );

View File

@@ -37,20 +37,39 @@
void bli_trsm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b_pack;
obj_t c1;
obj_t b_pack_s;
obj_t a1_pack_s;
obj_t a1, c1;
obj_t* b_pack = NULL;
obj_t* a1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t m_trans;
dim_t offA;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b_pack );
// Initialize object for packing B.
if( thread_am_ochief( thread ) ) {
bli_obj_init_pack( &b_pack_s );
bli_packm_init( b, &b_pack_s,
cntl_sub_packm_b( cntl ) );
}
b_pack = thread_obroadcast( thread, &b_pack_s );
// Initialize object for packing A1.
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
}
a1_pack = thread_obroadcast( thread, &a1_pack_s );
// Pack B1 (if instructed).
bli_packm_int( b, b_pack,
cntl_sub_packm_b( cntl ),
trsm_thread_sub_opackm( thread ) );
// Set the default length of and offset to the non-zero part of A.
m_trans = bli_obj_length_after_trans( *a );
@@ -61,20 +80,14 @@ void bli_trsm_blk_var1f( obj_t* a,
if ( bli_obj_is_lower( *a ) )
offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
// Initialize object for packing B.
bli_packm_init( b, &b_pack,
cntl_sub_packm_b( cntl ) );
// Pack B1 (if instructed).
bli_packm_int( b, &b_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
dim_t start, end;
bli_get_range( thread, offA, m_trans, 8, &start, &end );
// Partition along the remaining portion of the m dimension.
for ( i = offA; i < m_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, m_trans, a,
b_alg = bli_determine_blocksize_f( i, end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.
@@ -84,26 +97,33 @@ void bli_trsm_blk_var1f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize object for packing A1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Perform trsm subproblem.
bli_trsm_int( &BLIS_ONE,
&a1_pack,
&b_pack,
a1_pack,
b_pack,
&BLIS_ONE,
&c1,
cntl_sub_trsm( cntl ) );
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a1_pack );
if( thread_am_ichief( thread ) )
bli_obj_release_pack( b_pack );
}

View File

@@ -35,5 +35,6 @@
void bli_trsm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );

View File

@@ -37,40 +37,56 @@
void bli_trsm_blk_var2b( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
obj_t a_pack;
obj_t b1, b1_pack;
obj_t c1, c1_pack;
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
obj_t b1, c1;
obj_t* a_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c1_pack );
// Initialize pack objects for A that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
bli_obj_init_pack( &a_pack_s );
// Initialize object for packing A.
bli_packm_init( a, &a_pack_s,
cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing A.
bli_packm_init( a, &a_pack,
cntl_sub_packm_a( cntl ) );
// Pack A (if instructed).
bli_packm_int( a, &a_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
dim_t start, end;
bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end );
// Partition along the n dimension.
for ( i = 0; i < n_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, n_trans, b,
@@ -83,38 +99,55 @@ void bli_trsm_blk_var2b( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Packing must be done before computation
thread_ibarrier( thread );
// Perform trsm subproblem.
bli_trsm_int( &BLIS_ONE,
&a_pack,
&b1_pack,
a_pack,
b1_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_trsm( cntl ) );
c1_pack,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
// Currently must be done by 1 thread
if( thread_am_ichief( thread ) ) {
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
}
//Barrier to make sure unpacking is done before next iteration's packing of C
//Somehow, we'd like to make this a noop if packing isn't done.
thread_ibarrier( thread );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( b1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trsm_blk_var2b( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );

View File

@@ -37,40 +37,57 @@
void bli_trsm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
obj_t a_pack;
obj_t b1, b1_pack;
obj_t c1, c1_pack;
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
obj_t b1, c1;
obj_t* a_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c1_pack );
// Initialize pack objects for A that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
bli_obj_init_pack( &a_pack_s );
// Initialize object for packing A.
bli_packm_init( a, &a_pack_s,
cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing A.
bli_packm_init( a, &a_pack,
cntl_sub_packm_a( cntl ) );
// Pack A (if instructed).
bli_packm_int( a, &a_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
dim_t start, end;
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end );
// Partition along the n dimension.
for ( i = 0; i < n_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, n_trans, b,
@@ -83,38 +100,55 @@ void bli_trsm_blk_var2f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Packing must be done before computation
thread_ibarrier( thread );
// Perform trsm subproblem.
bli_trsm_int( &BLIS_ONE,
&a_pack,
&b1_pack,
a_pack,
b1_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_trsm( cntl ) );
c1_pack,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
// Currently must be done by 1 thread
if( thread_am_ichief( thread ) ) {
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
}
//Barrier to make sure unpacking is done before next iteration's packing of C
//Somehow, we'd like to make this a noop if packing isn't done.
thread_ibarrier( thread );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( b1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trsm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );

View File

@@ -37,38 +37,51 @@
void bli_trsm_blk_var3b( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b1, b1_pack;
obj_t c_pack;
obj_t c_pack_s;
obj_t a1_pack_s, b1_pack_s;
obj_t a1, b1;
obj_t* a1_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t k_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c_pack );
// Initialize pack objects for C that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
bli_obj_init_pack( &c_pack_s );
// Initialize object for packing C.
bli_packm_init( c, &c_pack_s,
cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &b1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntl_sub_packm_c( cntl ),
trsm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing C.
bli_packm_init( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Pack C (if instructed).
bli_packm_int( c, &c_pack,
cntl_sub_packm_c( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
// Partition along the k dimension.
for ( i = 0; i < k_trans; i += b_alg )
{
@@ -83,45 +96,60 @@ void bli_trsm_blk_var3b( obj_t* a,
i, b_alg, b, &b1 );
// Initialize objects for packing A1 and B1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Packing must be done before computation
thread_ibarrier( thread );
// Perform trsm subproblem.
bli_trsm_int( &BLIS_ONE,
&a1_pack,
&b1_pack,
a1_pack,
b1_pack,
&BLIS_ONE,
&c_pack,
cntl_sub_trsm( cntl ) );
c_pack,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal alpha scalars on A/B and C are non-zero, we must ensure
// that they are only used in the first iteration.
if ( i == 0 ) { bli_obj_scalar_reset( a );
bli_obj_scalar_reset( b );
bli_obj_scalar_reset( &c_pack ); }
if ( i == 0 && thread_am_ichief( thread ) ) {
bli_obj_scalar_reset( a );
bli_obj_scalar_reset( b );
bli_obj_scalar_reset( c_pack );
}
}
thread_obarrier( thread );
// Unpack C (if C was packed).
bli_unpackm_int( &c_pack, c,
cntl_sub_unpackm_c( cntl ) );
if( thread_am_ochief( thread ) ) {
bli_unpackm_int( c_pack, c,
cntl_sub_unpackm_c( cntl ) );
bli_obj_release_pack( c_pack );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( b1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trsm_blk_var3b( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );

View File

@@ -37,38 +37,51 @@
void bli_trsm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b1, b1_pack;
obj_t c_pack;
obj_t c_pack_s;
obj_t a1_pack_s, b1_pack_s;
obj_t a1, b1;
obj_t* a1_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t k_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c_pack );
// Initialize pack objects for C that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
bli_obj_init_pack( &c_pack_s );
// Initialize object for packing C.
bli_packm_init( c, &c_pack_s,
cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &b1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntl_sub_packm_c( cntl ),
trsm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing C.
bli_packm_init( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Pack C (if instructed).
bli_packm_int( c, &c_pack,
cntl_sub_packm_c( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
// Partition along the k dimension.
for ( i = 0; i < k_trans; i += b_alg )
{
@@ -83,45 +96,60 @@ void bli_trsm_blk_var3f( obj_t* a,
i, b_alg, b, &b1 );
// Initialize objects for packing A1 and B1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trsm_thread_sub_ipackm( thread ) );
// Packing must be done before computation
thread_ibarrier( thread );
// Perform trsm subproblem.
bli_trsm_int( &BLIS_ONE,
&a1_pack,
&b1_pack,
a1_pack,
b1_pack,
&BLIS_ONE,
&c_pack,
cntl_sub_trsm( cntl ) );
c_pack,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal alpha scalars on A/B and C are non-zero, we must ensure
// that they are only used in the first iteration.
if ( i == 0 ) { bli_obj_scalar_reset( a );
bli_obj_scalar_reset( b );
bli_obj_scalar_reset( &c_pack ); }
if ( i == 0 && thread_am_ichief( thread ) ) {
bli_obj_scalar_reset( a );
bli_obj_scalar_reset( b );
bli_obj_scalar_reset( c_pack );
}
}
thread_obarrier( thread );
// Unpack C (if C was packed).
bli_unpackm_int( &c_pack, c,
cntl_sub_unpackm_c( cntl ) );
if( thread_am_ochief( thread ) ) {
bli_unpackm_int( c_pack, c,
cntl_sub_unpackm_c( cntl ) );
bli_obj_release_pack( c_pack );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( b1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trsm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );

View File

@@ -125,12 +125,20 @@ void bli_trsm_front( side_t side,
if ( bli_is_left( side ) ) cntl = l_cntl;
else cntl = r_cntl;
// Invoke the internal back-end.
bli_trsm_int( alpha,
&a_local,
&b_local,
alpha,
&c_local,
cntl );
trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_trsm_int,
alpha,
&a_local,
&b_local,
alpha,
&c_local,
(void*) cntl,
(void**) infos );
bli_trsm_thrinfo_free_paths( infos );
}

View File

@@ -39,7 +39,8 @@
typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );
static FUNCPTR_T vars[2][2][4][3] =
{
@@ -88,7 +89,8 @@ void bli_trsm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
obj_t a_local;
obj_t b_local;
@@ -109,7 +111,9 @@ void bli_trsm_int( obj_t* alpha,
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *b ) )
{
bli_scalm( beta, c );
if( thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
return;
}
@@ -127,14 +131,17 @@ void bli_trsm_int( obj_t* alpha,
// packed, this is our last chance to handle the transposition.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
{
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
if( thread_am_ochief( thread ) ) {
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
}
}
// If beta is non-unit, apply it to the scalar attached to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
if( thread_am_ochief( thread ) )
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Set two bools: one based on the implied side parameter (the structure
@@ -150,7 +157,8 @@ void bli_trsm_int( obj_t* alpha,
// attached to B (the non-triangular matrix).
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &b_local );
if( thread_am_ochief( thread ) )
bli_obj_scalar_apply_scalar( alpha, &b_local );
}
}
else // if ( bli_obj_root_is_triangular( *b ) )
@@ -164,10 +172,13 @@ void bli_trsm_int( obj_t* alpha,
// attached to A (the non-triangular matrix).
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &a_local );
if( thread_am_ochief( thread ) )
bli_obj_scalar_apply_scalar( alpha, &a_local );
}
}
thread_obarrier( thread );
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
@@ -179,6 +190,7 @@ void bli_trsm_int( obj_t* alpha,
f( &a_local,
&b_local,
&c_local,
cntl );
cntl,
thread );
}

View File

@@ -37,4 +37,5 @@ void bli_trsm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );

View File

@@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)(
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
void* gemmtrsm_ukr,
void* gemm_ukr
void* gemm_ukr,
trsm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
@@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
void bli_trsm_ll_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -139,7 +141,8 @@ void bli_trsm_ll_ker_var2( obj_t* a,
buf_alpha2,
buf_c, rs_c, cs_c,
gemmtrsm_ukr,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemmtrsm_ukr, \
void* gemm_ukr \
void* gemm_ukr, \
trsm_thrinfo_t* thread \
) \
{ \
/* Cast the micro-kernels' addresses to their function pointer types. */ \

View File

@@ -39,7 +39,8 @@
void bli_trsm_ll_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );
//
@@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemmtrsm_ukr, \
void* gemm_ukr \
void* gemm_ukr, \
trsm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trsm_ll_ker_var2 )

View File

@@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)(
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
void* gemmtrsm_ukr,
void* gemm_ukr
void* gemm_ukr,
trsm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
@@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
void bli_trsm_lu_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -139,7 +141,8 @@ void bli_trsm_lu_ker_var2( obj_t* a,
buf_alpha2,
buf_c, rs_c, cs_c,
gemmtrsm_ukr,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemmtrsm_ukr, \
void* gemm_ukr \
void* gemm_ukr, \
trsm_thrinfo_t* thread \
) \
{ \
/* Cast the micro-kernels' addresses to their function pointer types. */ \

View File

@@ -39,7 +39,8 @@
void bli_trsm_lu_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );
//
@@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemmtrsm_ukr, \
void* gemm_ukr \
void* gemm_ukr, \
trsm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trsm_lu_ker_var2 )

View File

@@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)(
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
void* gemmtrsm_ukr,
void* gemm_ukr
void* gemm_ukr,
trsm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
@@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
void bli_trsm_rl_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -139,7 +141,8 @@ void bli_trsm_rl_ker_var2( obj_t* a,
buf_alpha2,
buf_c, rs_c, cs_c,
gemmtrsm_ukr,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemmtrsm_ukr, \
void* gemm_ukr \
void* gemm_ukr, \
trsm_thrinfo_t* thread \
) \
{ \
/* Cast the micro-kernels' addresses to their function pointer types. */ \

View File

@@ -39,7 +39,8 @@
void bli_trsm_rl_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );
//
@@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemmtrsm_ukr, \
void* gemm_ukr \
void* gemm_ukr, \
trsm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trsm_rl_ker_var2 )

View File

@@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)(
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
void* gemmtrsm_ukr,
void* gemm_ukr
void* gemm_ukr,
trsm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
@@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
void bli_trsm_ru_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl )
trsm_t* cntl,
trsm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -139,7 +141,8 @@ void bli_trsm_ru_ker_var2( obj_t* a,
buf_alpha2,
buf_c, rs_c, cs_c,
gemmtrsm_ukr,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemmtrsm_ukr, \
void* gemm_ukr \
void* gemm_ukr, \
trsm_thrinfo_t* thread \
) \
{ \
/* Cast the micro-kernels' addresses to their function pointer types. */ \

View File

@@ -39,7 +39,8 @@
void bli_trsm_ru_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trsm_t* cntl );
trsm_t* cntl,
trsm_thrinfo_t* thread );
//
@@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemmtrsm_ukr, \
void* gemm_ukr \
void* gemm_ukr, \
trsm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trsm_ru_ker_var2 )

View File

@@ -0,0 +1,173 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "assert.h"
// Populate an already-allocated trsm thrinfo node.
//
// Every member of *thread is overwritten with the corresponding argument:
//   ocomm / ocomm_id - outer communicator and this thread's id within it
//   icomm / icomm_id - inner communicator and this thread's id within it
//   n_way / work_id  - number of ways the loop is split and this thread's share
//   opackm / ipackm  - thrinfo objects forwarded to the packm calls (may be NULL)
//   sub_trsm         - thrinfo node for the next trsm level (may be NULL)
void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread,
                                  thread_comm_t* ocomm, dim_t ocomm_id,
                                  thread_comm_t* icomm, dim_t icomm_id,
                                  dim_t n_way, dim_t work_id,
                                  packm_thrinfo_t* opackm,
                                  packm_thrinfo_t* ipackm,
                                  trsm_thrinfo_t* sub_trsm )
{
	// All members are named explicitly, so this is a complete overwrite.
	*thread = ( trsm_thrinfo_t )
	{
		.ocomm    = ocomm,
		.ocomm_id = ocomm_id,
		.icomm    = icomm,
		.icomm_id = icomm_id,
		.n_way    = n_way,
		.work_id  = work_id,
		.opackm   = opackm,
		.ipackm   = ipackm,
		.sub_trsm = sub_trsm,
	};
}
// Configure *thread as the thrinfo of a single-threaded trsm.
//
// Both communicators are BLIS_SINGLE_COMM with id 0, the loop is split one
// way with work id 0, both packm thrinfos are BLIS_PACKM_SINGLE_THREADED, and
// the node is its own child, so following sub_trsm always yields this node.
void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread )
{
	thread_comm_t*   single_comm  = &BLIS_SINGLE_COMM;
	packm_thrinfo_t* single_packm = &BLIS_PACKM_SINGLE_THREADED;

	bli_setup_trsm_thrinfo_node( thread,
	                             single_comm, 0,
	                             single_comm, 0,
	                             1, 0,
	                             single_packm,
	                             single_packm,
	                             thread );
}
// Allocate a trsm thrinfo node with bli_malloc() and initialize every member
// from the arguments (see bli_setup_trsm_thrinfo_node() for their meaning).
// Returns the new node; the caller owns it.
trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
                                              thread_comm_t* icomm, dim_t icomm_id,
                                              dim_t n_way, dim_t work_id,
                                              packm_thrinfo_t* opackm,
                                              packm_thrinfo_t* ipackm,
                                              trsm_thrinfo_t* sub_trsm )
{
	trsm_thrinfo_t* node = ( trsm_thrinfo_t* ) bli_malloc( sizeof( *node ) );

	bli_setup_trsm_thrinfo_node( node,
	                             ocomm, ocomm_id,
	                             icomm, icomm_id,
	                             n_way, work_id,
	                             opackm, ipackm,
	                             sub_trsm );

	return node;
}
// Intended counterpart of bli_create_trsm_thrinfo_paths().
//
// NOTE(review): this is currently an empty stub -- nothing is released. The
// path array, the thrinfo nodes and the communicators allocated by
// bli_create_trsm_thrinfo_paths() are therefore not freed by this call.
// TODO: implement the teardown.
void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads )
{
}
// Build the per-thread thrinfo "paths" used by a multithreaded trsm.
//
// The number of ways each of five nested levels is split is read from the
// environment variables BLIS_JC_NT, BLIS_KC_NT, BLIS_IC_NT, BLIS_JR_NT and
// BLIS_IR_NT (1 when unset). The total thread count is the product of the
// five values.
//
// Returns a malloc()'ed array with one entry per thread, indexed by the
// thread's id in the global communicator. Each entry is the head of a chain
// of five trsm_thrinfo_t nodes (jc -> kc -> ic -> jr -> ir) linked through
// sub_trsm. In every node, icomm is the communicator created for that node's
// own level and ocomm is the communicator of the enclosing level.
//
// Aborts via assert() if any way count is not positive or if the path array
// cannot be allocated.
trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( )
{
	dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
	dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
	dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
	dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
	dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );

	// Check each factor, not only the product: two negative counts would
	// multiply to a positive total while the loops below never execute,
	// leaving the returned array uninitialized.
	assert( jc_way > 0 && kc_way > 0 && ic_way > 0 && jr_way > 0 && ir_way > 0 );

	dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
	assert( global_num_threads > 0 );

	// Number of threads sharing one communicator at each level.
	dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
	dim_t kc_nt = ic_way * jr_way * ir_way;
	dim_t ic_nt = jr_way * ir_way;
	dim_t jr_nt = ir_way;
	dim_t ir_nt = 1;

	trsm_thrinfo_t** paths = malloc( global_num_threads * sizeof( *paths ) );
	assert( paths != NULL );

	thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
	for( int a = 0; a < jc_way; a++ )
	{
		thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
		for( int b = 0; b < kc_way; b++ )
		{
			thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
			for( int c = 0; c < ic_way; c++ )
			{
				thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
				for( int d = 0; d < jr_way; d++ )
				{
					thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
					for( int e = 0; e < ir_way; e++)
					{
						thread_comm_t* ir_comm = bli_create_communicator( ir_nt );

						// This thread's id within each communicator, built
						// from the innermost level outwards.
						dim_t ir_comm_id     = 0;
						dim_t jr_comm_id     = e*ir_nt + ir_comm_id;
						dim_t ic_comm_id     = d*jr_nt + jr_comm_id;
						dim_t kc_comm_id     = c*ic_nt + ic_comm_id;
						dim_t jc_comm_id     = b*kc_nt + kc_comm_id;
						dim_t global_comm_id = a*jc_nt + jc_comm_id;

						// Nodes are created leaf-first so that each parent
						// can point at its child.
						trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id,
						                                                        ir_comm, ir_comm_id,
						                                                        ir_way, e,
						                                                        NULL, NULL, NULL);
						trsm_thrinfo_t* jr_info = bli_create_trsm_thrinfo_node( ic_comm, ic_comm_id,
						                                                        jr_comm, jr_comm_id,
						                                                        jr_way, d,
						                                                        NULL, NULL, ir_info);

						// Only the ic-level node carries packm thrinfos: its
						// opackm spans the kc communicator and its ipackm
						// spans the ic communicator.
						packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
						                                                       ic_comm, ic_comm_id,
						                                                       kc_nt, kc_comm_id );
						packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
						                                                       jr_comm, jr_comm_id,
						                                                       ic_nt, ic_comm_id );
						trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id,
						                                                        ic_comm, ic_comm_id,
						                                                        ic_way, c,
						                                                        packb, packa, jr_info);
						trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id,
						                                                        kc_comm, kc_comm_id,
						                                                        kc_way, b,
						                                                        NULL, NULL, ic_info);
						trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id,
						                                                        jc_comm, jc_comm_id,
						                                                        jc_way, a,
						                                                        NULL, NULL, kc_info);

						paths[global_comm_id] = jc_info;
					}
				}
			}
		}
	}
	return paths;
}

View File

@@ -0,0 +1,79 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Per-thread, per-level threading information for trsm. One node describes
// a thread's role at one level; nodes are chained through sub_trsm.
struct trsm_thrinfo_s //implements thrinfo_t
{
// Outer communicator. As built by bli_create_trsm_thrinfo_paths(), this is
// the communicator of the enclosing level.
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
dim_t ocomm_id; //Our thread id within that thread comm
// Inner communicator. As built by bli_create_trsm_thrinfo_paths(), this is
// the communicator created for this node's own level.
thread_comm_t* icomm; //The inner thread communicator (see the note above)
dim_t icomm_id; //Our thread id within that thread comm
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
dim_t work_id; //What we're working on
// Thrinfo objects handed to packm calls; read through
// trsm_thread_sub_opackm() and trsm_thread_sub_ipackm(). May be NULL.
packm_thrinfo_t* opackm;
packm_thrinfo_t* ipackm;
// Thrinfo for the next trsm level; read through trsm_thread_sub_trsm().
// NULL on the innermost node built by bli_create_trsm_thrinfo_paths();
// points back at the node itself after bli_setup_trsm_single_threaded_info().
struct trsm_thrinfo_s* sub_trsm;
};
typedef struct trsm_thrinfo_s trsm_thrinfo_t;
// Accessors for the child thrinfo objects of a trsm_thrinfo_t*.
// The argument is parenthesized so that expressions such as `&info` expand
// correctly.
#define trsm_thread_sub_trsm( thread ) ( (thread)->sub_trsm )
#define trsm_thread_sub_opackm( thread ) ( (thread)->opackm )
#define trsm_thread_sub_ipackm( thread ) ( (thread)->ipackm )
// Round-robin ownership tests: iteration `index` belongs to this thread when
// index modulo n_way equals the thread's work_id modulo n_way. Both arguments
// are parenthesized so that compound expressions (e.g. `i + j`) are reduced
// as a whole.
#define trsm_r_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trsm_r_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trsm_l_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trsm_l_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
// Builds one thrinfo path per thread; the way counts come from the
// BLIS_JC_NT, BLIS_KC_NT, BLIS_IC_NT, BLIS_JR_NT and BLIS_IR_NT environment
// variables. The returned array is indexed by global thread id.
trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( );
// Intended to release what bli_create_trsm_thrinfo_paths() returned.
// NOTE(review): the current definition is an empty stub.
void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** );
// Overwrites every member of an existing node with the given values.
void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread,
thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
trsm_thrinfo_t* sub_trsm );
// Allocates a node with bli_malloc() and initializes it as above.
trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
trsm_thrinfo_t* sub_trsm );
// Initializes a node for single-threaded execution (one way, work id 0,
// BLIS_SINGLE_COMM for both communicators, node is its own sub_trsm).
void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread );

View File

@@ -230,8 +230,51 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
*end = bli_min( *start + n_pt, size + all_start );
}
void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
// Width of the next caucus's sub-range, given that sub-ranges totalling `len`
// have already been handed out.
//
// Solves (len + width)^2 - len^2 = num for width, truncates the result toward
// zero, and then rounds it up to a multiple of block_factor. A block_factor
// of 1 or less leaves the width unrounded.
static dim_t bli_weighted_caucus_width( dim_t len, dim_t num, dim_t block_factor )
{
	dim_t width = sqrt( len*len + num ) - len;

	if( block_factor > 1 && width % block_factor != 0 )
		width += block_factor - ( width % block_factor );

	return width;
}

// Compute the sub-range [*start, *end) of [all_start, all_end) assigned to
// the calling thread, using unequal widths.
//
//   thr          - pointer to the thread's thrinfo_t; n_way and work_id are read
//   all_start    - first index of the whole range
//   all_end      - one past the last index of the whole range
//   block_factor - sub-range widths are rounded up to a multiple of this
//   forward      - selects from which end the sub-ranges are carved (below)
//   start, end   - outputs
//
// Widths are chosen so that each caucus receives an equal share,
// size*size / n_way, of the difference of squares of the cumulative offset;
// the first width carved is therefore the largest and later ones shrink.
//
// If forward is nonzero, carving starts at all_end: the highest work_id gets
// the (widest) tail and work_id 0 gets the head. Otherwise carving starts at
// all_start: work_id 0 gets the (widest) head and the highest work_id gets
// the tail.
//
// The sub-ranges of all caucuses are disjoint, stay inside
// [all_start, all_end), and together cover it: whichever caucus is carved
// last absorbs any remainder left by integer truncation. A caucus may
// receive an empty range (*start == *end) when the range is exhausted before
// its turn.
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
{
	thrinfo_t* thread = (thrinfo_t*) thr;
	dim_t n_way = thread->n_way;
	dim_t work_id = thread->work_id;
	dim_t size = all_end - all_start;
	dim_t num = size*size / n_way; // share of the squared length per caucus
	dim_t len = 0;                 // cumulative width already handed out
	dim_t width;
	dim_t caucus;

	*start = all_start;
	*end = all_end;

	if( forward ) {
		// Skip over the sub-ranges of the caucuses carved before ours,
		// never moving past the beginning of the range.
		for( caucus = n_way - 1; caucus > work_id; caucus-- ) {
			width = bli_weighted_caucus_width( len, num, block_factor );
			*end = ( *end - all_start > width ) ? *end - width : all_start;
			len += width;
		}
		// Caucus 0 is carved last and keeps everything down to all_start.
		if( work_id > 0 ) {
			width = bli_weighted_caucus_width( len, num, block_factor );
			if( *end - all_start > width )
				*start = *end - width;
		}
	}
	else{
		// Offsets are measured from all_start so that the widths add up to
		// the size of the range regardless of where the range begins.
		for( caucus = 0; caucus < work_id; caucus++ ) {
			width = bli_weighted_caucus_width( len, num, block_factor );
			*start = bli_min( *start + width, all_end );
			len += width;
		}
		// The highest caucus is carved last and keeps everything up to all_end.
		if( work_id < n_way - 1 ) {
			width = bli_weighted_caucus_width( len, num, block_factor );
			*end = bli_min( *start + width, all_end );
		}
	}
}
void bli_level3_thread_decorator( dim_t n_threads,
@@ -257,3 +300,14 @@ void bli_level3_thread_decorator( dim_t n_threads,
thread[omp_id] );
}
}
// Read a way count from the environment variable named `env`.
//
// Returns the leading base-10 integer of the variable's value when it is at
// least 1. Returns 1 when the variable is unset, does not start with a
// number, or holds zero or a negative value, so callers never receive a way
// count that would yield zero or negative thread totals. Text following the
// number is ignored.
dim_t bli_read_nway_from_env( char* env )
{
	dim_t number = 1;
	char* str = getenv( env );
	if( str != NULL )
	{
		char* parse_end = NULL;
		long value = strtol( str, &parse_end, 10 );
		// parse_end == str means no digits were consumed.
		if( parse_end != str && value >= 1 )
			number = value;
	}
	return number;
}

View File

@@ -88,11 +88,13 @@ typedef struct thrinfo_s thrinfo_t;
#define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id )
// Computes the calling thread's sub-range [*start, *end) of
// [all_start, all_end). NOTE(review): only the tail of the definition is
// visible here; presumably the range is split evenly in units of
// block_factor -- confirm against the definition.
void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
// Like bli_get_range(), but the sub-range widths are unequal: they are
// derived from a square-root formula and rounded up to a multiple of
// block_factor. `forward` selects the end of the range from which the
// sub-ranges are carved.
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end);
// NOTE(review): definitions not visible here; by analogy with the trsm
// variants these presumably allocate / fill in a thrinfo_t -- confirm.
thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
// Parses the environment variable named `env` as a base-10 integer; returns
// 1 when the variable is not set.
dim_t bli_read_nway_from_env( char* env );
//void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm );
//thrinfo_t* bli_create_thread_info( dim_t* n_threads_each_level, dim_t n_levels );
@@ -100,6 +102,7 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm
#include "bli_gemm_threading.h"
#include "bli_herk_threading.h"
#include "bli_trmm_threading.h"
#include "bli_trsm_threading.h"
typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread );
void bli_level3_thread_decorator( dim_t num_threads,