Parallelized trmm and trmm3

Also fixed bugs in packm
This commit is contained in:
Tyler Smith
2014-03-20 16:43:36 -05:00
parent c0140cb752
commit 5d5dc2eede
40 changed files with 897 additions and 403 deletions

View File

@@ -263,18 +263,14 @@ void PASTEMAC(ch,varname )( \
} \
\
p_begin = p_cast; \
dim_t t_id = thread_id( thread ); \
dim_t num_threads = thread_num_threads( thread ); \
p_inc = ps_p; \
\
for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \
ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
p_begin = p_cast + (ip )*p_inc; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
@@ -323,6 +319,8 @@ void PASTEMAC(ch,varname )( \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk)( strucc, \
diagoffp_i, \
diagc, \
@@ -336,6 +334,7 @@ void PASTEMAC(ch,varname )( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
}\
\
\
p_inc = ldp * panel_len_max_i; \
@@ -349,6 +348,8 @@ void PASTEMAC(ch,varname )( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk)( strucc, \
diagoffc_i, \
uploc, \
@@ -360,6 +361,7 @@ void PASTEMAC(ch,varname )( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
@@ -373,6 +375,8 @@ void PASTEMAC(ch,varname )( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
@@ -384,10 +388,13 @@ void PASTEMAC(ch,varname )( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
\
} \
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
} \
} \
\
\
p_begin += p_inc; \
} \
\
\

View File

@@ -303,18 +303,14 @@ void PASTEMAC(ch,varname)( \
} \
\
p_begin = p_cast; \
dim_t t_id = thread_id( thread ); \
dim_t num_threads = thread_num_threads( thread ); \
p_inc = ps_p; \
\
for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \
ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
p_begin = p_cast + (ip )*p_inc; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
@@ -363,6 +359,8 @@ void PASTEMAC(ch,varname)( \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \
diagoffp_i, \
diagc, \
@@ -376,6 +374,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
} \
\
\
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
@@ -399,6 +398,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \
diagoffc_i, \
uploc, \
@@ -411,6 +412,7 @@ void PASTEMAC(ch,varname)( \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
\
} \
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
} \
@@ -423,6 +425,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
@@ -434,6 +438,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
@@ -448,6 +453,8 @@ void PASTEMAC(ch,varname)( \
*/ \
\
} \
\
p_begin += p_inc; \
} \
}

View File

@@ -303,18 +303,14 @@ void PASTEMAC(ch,varname)( \
} \
\
p_begin = p_cast; \
dim_t t_id = thread_id( thread ); \
dim_t num_threads = thread_num_threads( thread ); \
p_inc = ps_p; \
\
for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \
ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
p_begin = p_cast + (ip )*p_inc; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
@@ -363,6 +359,8 @@ void PASTEMAC(ch,varname)( \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \
diagoffp_i, \
diagc, \
@@ -376,6 +374,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
} \
\
p_inc = ldp * panel_len_max_i; \
\
@@ -406,6 +405,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \
diagoffc_i, \
uploc, \
@@ -417,6 +418,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
@@ -430,6 +432,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
@@ -441,6 +445,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
@@ -463,6 +468,8 @@ void PASTEMAC(ch,varname)( \
*/ \
\
} \
\
p_begin += p_inc; \
} \
}

View File

@@ -44,6 +44,8 @@ struct packm_thrinfo_s //implements thrinfo_t
};
typedef struct packm_thrinfo_s packm_thrinfo_t;
/*
 * packm_thread_my_iter( index, thread )
 *
 * Evaluates to nonzero when iteration number `index` is assigned to `thread`
 * under a round-robin split: the iteration belongs to the thread whose
 * work_id is congruent to `index` modulo the thread's n_way.
 *
 * Both arguments and the whole expansion are parenthesized so that compound
 * expressions (e.g. `i + 1`, `&info`) can be passed safely. Note that
 * `thread` is still evaluated more than once, so it must not have side
 * effects.
 */
#define packm_thread_my_iter( index, thread ) \
    ( ( (index) % (thread)->n_way ) == ( (thread)->work_id % (thread)->n_way ) )
packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,

View File

@@ -83,7 +83,7 @@ void bli_gemm_blk_var1f( obj_t* a,
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *a );
dim_t start, end;
bli_get_range( thread, m_trans, 8, &start, &end );
bli_get_range( thread, 0, m_trans, 8, &start, &end );
// Partition along the m dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -82,7 +82,7 @@ void bli_gemm_blk_var2f( obj_t* a,
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
bli_get_range( thread, n_trans, 8, &start, &end );
bli_get_range( thread, 0, n_trans, 8, &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -85,7 +85,7 @@ void bli_hemm_front( side_t side,
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t*) bli_gemm_int,
(level3_int_t) bli_gemm_int,
alpha,
&a_local,
&b_local,

View File

@@ -116,7 +116,7 @@ void bli_her2k_front( obj_t* alpha,
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t*) bli_herk_int,
(level3_int_t) bli_herk_int,
alpha,
&a_local,
&bh_local,
@@ -126,7 +126,7 @@ void bli_her2k_front( obj_t* alpha,
(void**) infos );
bli_level3_thread_decorator( n_threads,
(level3_int_t*) bli_herk_int,
(level3_int_t) bli_herk_int,
&alpha_conj,
&b_local,
&ah_local,

View File

@@ -82,7 +82,7 @@ void bli_herk_blk_var1f( obj_t* a,
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *c );
dim_t start, end;
bli_get_range( thread, m_trans, 8, &start, &end );
bli_get_range( thread, 0, m_trans, 8, &start, &end );
// Partition along the m dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -90,7 +90,7 @@ void bli_herk_blk_var2f( obj_t* a,
dim_t start, end;
// TODO: replace with a weighted range, because the matrix is triangular
bli_get_range( thread, n_trans, 8, &start, &end );
bli_get_range( thread, 0, n_trans, 8, &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -82,7 +82,7 @@ void bli_herk_front( obj_t* alpha,
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t*) bli_herk_int,
(level3_int_t) bli_herk_int,
alpha,
&a_local,
&ah_local,

View File

@@ -84,7 +84,7 @@ void bli_symm_front( side_t side,
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t*) bli_gemm_int,
(level3_int_t) bli_gemm_int,
alpha,
&a_local,
&b_local,

View File

@@ -98,7 +98,7 @@ void bli_syr2k_front( obj_t* alpha,
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t*) bli_herk_int,
(level3_int_t) bli_herk_int,
alpha,
&a_local,
&bt_local,
@@ -108,7 +108,7 @@ void bli_syr2k_front( obj_t* alpha,
(void**) infos );
bli_level3_thread_decorator( n_threads,
(level3_int_t*) bli_herk_int,
(level3_int_t) bli_herk_int,
alpha,
&b_local,
&at_local,

View File

@@ -78,7 +78,7 @@ void bli_syrk_front( obj_t* alpha,
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t*) bli_herk_int,
(level3_int_t) bli_herk_int,
alpha,
&a_local,
&at_local,

View File

@@ -37,21 +37,48 @@
void bli_trmm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b_pack;
obj_t c1, c1_pack;
obj_t b_pack_s;
obj_t a1_pack_s, c1_pack_s;
obj_t a1, c1;
obj_t* a1_pack = NULL;
obj_t* b_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t m_trans;
dim_t offA;
if( thread_am_ochief( thread ) ) {
// Initialize object for packing B.
bli_obj_init_pack( &b_pack_s );
bli_packm_init( b, &b_pack_s,
cntl_sub_packm_b( cntl ) );
// Scale C by beta (if instructed).
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
b_pack = thread_obroadcast( thread, &b_pack_s );
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b_pack );
bli_obj_init_pack( &c1_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack B (if instructed).
bli_packm_int( b, b_pack,
cntl_sub_packm_b( cntl ),
trmm_thread_sub_opackm( thread ) );
// Set the default length of and offset to the non-zero part of A.
m_trans = bli_obj_length_after_trans( *a );
@@ -66,25 +93,14 @@ void bli_trmm_blk_var1f( obj_t* a,
m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) +
bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing B.
bli_packm_init( b, &b_pack,
cntl_sub_packm_b( cntl ) );
// Pack B (if instructed).
bli_packm_int( b, &b_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
dim_t start, end;
bli_get_range( thread, offA, m_trans, 8, &start, &end );
// Partition along the m dimension.
for ( i = offA; i < m_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, m_trans, a,
b_alg = bli_determine_blocksize_f( i, end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.
@@ -94,38 +110,55 @@ void bli_trmm_blk_var1f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing A1 and C1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Packing must be finished before computation
thread_ibarrier( thread );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a1_pack,
&b_pack,
a1_pack,
b_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_trmm( cntl ) );
c1_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
// Currently must be done by 1 thread
if( thread_am_ichief( thread ) ) {
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
}
//Barrier to make sure unpacking is done before next iteration's packing of C
//Somehow, we'd like to make this a noop if packing isn't done.
thread_ibarrier( thread );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( b_pack );
if( thread_am_ichief( thread ) ){
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,43 +37,58 @@
void bli_trmm_blk_var2b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread)
{
obj_t a_pack;
obj_t b1, b1_pack;
obj_t c1, c1_pack;
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
obj_t b1, c1;
obj_t* a_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c1_pack );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing A.
bli_packm_init( a, &a_pack,
cntl_sub_packm_a( cntl ) );
// Pack A (if instructed).
bli_packm_int( a, &a_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
dim_t start, end;
bli_get_range( thread, 0, n_trans, 8, &start, &end );
// Partition along the n dimension.
for ( i = 0; i < n_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, n_trans, b,
b_alg = bli_determine_blocksize_b( i, end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.
@@ -83,38 +98,55 @@ void bli_trmm_blk_var2b( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Packing must be finished before computation
thread_ibarrier( thread );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a_pack,
&b1_pack,
a_pack,
b1_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_trmm( cntl ) );
c1_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
// Unpack C1 (if C1 was packed).
// Currently must be done by 1 thread
if( thread_am_ichief( thread ) ) {
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
}
//Barrier to make sure unpacking is done before next iteration's packing of C
//Somehow, we'd like to make this a noop if packing isn't done.
thread_ibarrier( thread );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( b1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var2b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,43 +37,58 @@
void bli_trmm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread)
{
obj_t a_pack;
obj_t b1, b1_pack;
obj_t c1, c1_pack;
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
obj_t b1, c1;
obj_t* a_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c1_pack );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing A.
bli_packm_init( a, &a_pack,
cntl_sub_packm_a( cntl ) );
// Pack A (if instructed).
bli_packm_int( a, &a_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
dim_t start, end;
bli_get_range( thread, 0, n_trans, 8, &start, &end );
// Partition along the n dimension.
for ( i = 0; i < n_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, n_trans, b,
b_alg = bli_determine_blocksize_f( i, end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.
@@ -83,38 +98,55 @@ void bli_trmm_blk_var2f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Packing must be finished before computation
thread_ibarrier( thread );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a_pack,
&b1_pack,
a_pack,
b1_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_trmm( cntl ) );
c1_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
// Unpack C1 (if C1 was packed).
// Currently must be done by 1 thread
if( thread_am_ichief( thread ) ) {
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
}
//Barrier to make sure unpacking is done before next iteration's packing of C
//Somehow, we'd like to make this a noop if packing isn't done.
thread_ibarrier( thread );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( b1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,38 +37,50 @@
void bli_trmm_blk_var3b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b1, b1_pack;
obj_t c_pack;
obj_t c_pack_s;
obj_t a1_pack_s, b1_pack_s;
obj_t a1, b1;
obj_t* a1_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t k_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c_pack );
if( thread_am_ochief( thread ) ){
// Initialize object for packing C
bli_obj_init_pack( &c_pack_s );
bli_packm_init( c, &c_pack_s,
cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
// Initialize pack objects for A and B that are passed into packm_init().
if( thread_am_ichief( thread ) ){
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &b1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntl_sub_packm_c( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing C.
bli_packm_init( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Pack C (if instructed).
bli_packm_int( c, &c_pack,
cntl_sub_packm_c( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
// Partition along the k dimension.
for ( i = 0; i < k_trans; i += b_alg )
{
@@ -83,38 +95,51 @@ void bli_trmm_blk_var3b( obj_t* a,
i, b_alg, b, &b1 );
// Initialize objects for packing A1 and B1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Packing must be done before computation
thread_ibarrier( thread );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a1_pack,
&b1_pack,
a1_pack,
b1_pack,
&BLIS_ONE,
&c_pack,
cntl_sub_trmm( cntl ) );
c_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
}
// Unpack C (if C was packed).
bli_unpackm_int( &c_pack, c,
cntl_sub_unpackm_c( cntl ) );
thread_obarrier( thread );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c_pack );
// Unpack C (if C was packed).
if( thread_am_ochief( thread ) ){
bli_unpackm_int( c_pack, c,
cntl_sub_unpackm_c( cntl ) );
bli_obj_release_pack( c_pack );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
if( thread_am_ichief( thread ) ){
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( b1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var3b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,38 +37,50 @@
void bli_trmm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b1, b1_pack;
obj_t c_pack;
obj_t c_pack_s;
obj_t a1_pack_s, b1_pack_s;
obj_t a1, b1;
obj_t* a1_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t k_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c_pack );
if( thread_am_ochief( thread ) ){
// Initialize object for packing C
bli_obj_init_pack( &c_pack_s );
bli_packm_init( c, &c_pack_s,
cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
// Initialize pack objects for A and B that are passed into packm_init().
if( thread_am_ichief( thread ) ){
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &b1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntl_sub_packm_c( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing C.
bli_packm_init( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Pack C (if instructed).
bli_packm_int( c, &c_pack,
cntl_sub_packm_c( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
// Partition along the k dimension.
for ( i = 0; i < k_trans; i += b_alg )
{
@@ -83,38 +95,51 @@ void bli_trmm_blk_var3f( obj_t* a,
i, b_alg, b, &b1 );
// Initialize objects for packing A1 and B1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
trmm_thread_sub_ipackm( thread ) );
// Packing must be done before computation
thread_ibarrier( thread );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a1_pack,
&b1_pack,
a1_pack,
b1_pack,
&BLIS_ONE,
&c_pack,
cntl_sub_trmm( cntl ) );
c_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
}
// Unpack C (if C was packed).
bli_unpackm_int( &c_pack, c,
cntl_sub_unpackm_c( cntl ) );
thread_obarrier( thread );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c_pack );
// Unpack C (if C was packed).
if( thread_am_ochief( thread ) ){
bli_unpackm_int( c_pack, c,
cntl_sub_unpackm_c( cntl ) );
bli_obj_release_pack( c_pack );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
if( thread_am_ichief( thread ) ){
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( b1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -125,12 +125,20 @@ void bli_trmm_front( side_t side,
if ( bli_is_left( side ) ) cntl = l_cntl;
else cntl = r_cntl;
// Invoke the internal back-end.
bli_trmm_int( alpha,
&a_local,
&b_local,
&BLIS_ZERO,
&c_local,
cntl );
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_trmm_int,
alpha,
&a_local,
&b_local,
&BLIS_ZERO,
&c_local,
(void*) cntl,
(void**) infos );
bli_trmm_thrinfo_free_paths( infos );
}

View File

@@ -39,7 +39,8 @@
typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
static FUNCPTR_T vars[2][2][4][3] =
{
@@ -88,7 +89,8 @@ void bli_trmm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a_local;
obj_t b_local;
@@ -173,6 +175,7 @@ void bli_trmm_int( obj_t* alpha,
f( &a_local,
&b_local,
&c_local,
cntl );
cntl,
thread );
}

View File

@@ -37,4 +37,5 @@ void bli_trmm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
trmm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
void bli_trmm_ll_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -131,7 +133,8 @@ void bli_trmm_ll_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -270,9 +274,12 @@ void PASTEMAC(ch,varname)( \
b1 = b_cast; \
c1 = c_cast; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
for ( j = 0; j < n_iter; ++j ) { \
\
if( trmm_l_jr_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
@@ -307,121 +314,124 @@ void PASTEMAC(ch,varname)( \
off_a1011 = 0; \
k_a1011 = diagoffa_i + MR; \
\
b1_i = b1 + off_a1011 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1011 * ss_a; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the panel stride of the current panel of A to the
auxinfo_t object. */ \
bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
{ \
b1_i = b1 + off_a1011 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1011 * ss_a; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the panel stride of the current panel of A to the
auxinfo_t object. */ \
bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
a1 += k_a1011 * ss_a; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
{ \
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the panel stride of the current panel of A to the
auxinfo_t object. */ \
bli_auxinfo_set_ps_a( rstep_a, aux ); \
/* Save the panel stride of the current panel of A to the
auxinfo_t object. */ \
bli_auxinfo_set_ps_a( rstep_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
\
} \
b1 += cstep_b; \
c1 += cstep_c; \
} \

View File

@@ -39,7 +39,8 @@
void bli_trmm_ll_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 )

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
trmm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
void bli_trmm_lu_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -131,7 +133,8 @@ void bli_trmm_lu_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -277,6 +281,8 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -294,7 +300,7 @@ void PASTEMAC(ch,varname)( \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
for ( i = 0; i < m_iter; ++i ) if( trmm_l_jr_my_iter( j, jr_thread ) ) { \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
@@ -315,6 +321,7 @@ void PASTEMAC(ch,varname)( \
off_a1112 = diagoffa_i; \
k_a1112 = k - off_a1112; \
\
if( trmm_l_ir_my_iter( i, ir_thread ) ) { \
b1_i = b1 + off_a1112 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
@@ -369,11 +376,12 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += k_a1112 * ss_a; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
if( trmm_l_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
@@ -423,13 +431,13 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
\
} \
b1 += cstep_b; \
c1 += cstep_c; \
} \

View File

@@ -39,7 +39,8 @@
void bli_trmm_lu_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 )

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
trmm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
void bli_trmm_rl_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -131,7 +133,8 @@ void bli_trmm_rl_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -278,6 +282,7 @@ void PASTEMAC(ch,varname)( \
b1 = b_cast; \
c1 = c_cast; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
@@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \
in A. Then compute the length of that panel. */ \
off_b1121 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b1121; \
\
if( trmm_r_jr_my_iter( j, jr_thread ) ) { \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a1_i; \
ctype* restrict a2; \
\
@@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
c11 += rstep_c; \
} \
@@ -378,6 +386,7 @@ void PASTEMAC(ch,varname)( \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
@@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
} \
b1 += k_b1121 * ss_b; \
c1 += cstep_c; \
} \

View File

@@ -39,7 +39,8 @@
void bli_trmm_rl_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 )

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
trmm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
void bli_trmm_ru_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -131,7 +133,8 @@ void bli_trmm_ru_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -279,6 +283,7 @@ void PASTEMAC(ch,varname)( \
b1 = b_cast; \
c1 = c_cast; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
@@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
if( trmm_r_jr_my_iter( j, jr_thread ) ) { \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a1_i; \
ctype* restrict a2; \
\
@@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
c11 += rstep_c; \
} \
@@ -378,6 +386,7 @@ void PASTEMAC(ch,varname)( \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
@@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
} \
b1 += k_b0111 * ss_b; \
c1 += cstep_c; \
} \

View File

@@ -39,7 +39,8 @@
void bli_trmm_ru_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 )

View File

@@ -0,0 +1,173 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "assert.h"
/*
   Fill in every member of an existing trmm thread-info node.

   thread             - node to overwrite; must point to valid storage.
   ocomm, ocomm_id    - outer communicator and this thread's id within it.
   icomm, icomm_id    - inner communicator and this thread's id within it.
   n_way, work_id     - how many ways the loop is split and which share
                        this thread takes.
   opackm, ipackm     - packm thread-info objects attached to the node
                        (may be NULL).
   sub_trmm           - node describing the next loop level (may be NULL).

   Nothing is allocated or validated here; the pointers are stored as given.
*/
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
                                  thread_comm_t* ocomm, dim_t ocomm_id,
                                  thread_comm_t* icomm, dim_t icomm_id,
                                  dim_t n_way, dim_t work_id,
                                  packm_thrinfo_t* opackm,
                                  packm_thrinfo_t* ipackm,
                                  trmm_thrinfo_t* sub_trmm )
{
	/* One aggregate assignment covering all nine members of the node. */
	*thread = ( trmm_thrinfo_t )
	{
		.ocomm    = ocomm,
		.ocomm_id = ocomm_id,
		.icomm    = icomm,
		.icomm_id = icomm_id,
		.n_way    = n_way,
		.work_id  = work_id,
		.opackm   = opackm,
		.ipackm   = ipackm,
		.sub_trmm = sub_trmm
	};
}
/*
   Configure `thread` for a run with no parallelism: both communicators are
   BLIS_SINGLE_COMM with id 0, the loop is split one way with work id 0,
   both packm slots use BLIS_PACKM_SINGLE_THREADED, and the node is its own
   sub-node, so descending through sub_trmm always yields the same node.
*/
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread )
{
	*thread = ( trmm_thrinfo_t )
	{
		.ocomm    = &BLIS_SINGLE_COMM,
		.ocomm_id = 0,
		.icomm    = &BLIS_SINGLE_COMM,
		.icomm_id = 0,
		.n_way    = 1,
		.work_id  = 0,
		.opackm   = &BLIS_PACKM_SINGLE_THREADED,
		.ipackm   = &BLIS_PACKM_SINGLE_THREADED,
		/* Self-reference: every loop level reuses this one node. */
		.sub_trmm = thread
	};
}
/*
   Allocate a trmm thread-info node with bli_malloc() and initialise it via
   bli_setup_trmm_thrinfo_node() using the arguments exactly as passed.

   Returns the new node. The bli_malloc() result is used without a check
   in this function.
*/
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
                                              thread_comm_t* icomm, dim_t icomm_id,
                                              dim_t n_way, dim_t work_id,
                                              packm_thrinfo_t* opackm,
                                              packm_thrinfo_t* ipackm,
                                              trmm_thrinfo_t* sub_trmm )
{
	trmm_thrinfo_t* node = ( trmm_thrinfo_t* ) bli_malloc( sizeof( *node ) );

	bli_setup_trmm_thrinfo_node( node,
	                             ocomm, ocomm_id,
	                             icomm, icomm_id,
	                             n_way, work_id,
	                             opackm, ipackm,
	                             sub_trmm );

	return node;
}
// Intended counterpart of bli_create_trmm_thrinfo_paths().
// NOTE(review): the body is currently empty, so the array, the thrinfo
// nodes, the packm thread infos and the communicators created by
// bli_create_trmm_thrinfo_paths() are not released by this call.
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads )
{
}
// Build one chain of thrinfo nodes per thread, covering the five loops
// jc -> kc -> ic -> jr -> ir (outermost to innermost), and return an array
// of the chain heads (the jc-level nodes) indexed by global thread id.
// The array has jc_way * kc_way * ic_way * jr_way * ir_way entries.
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( )
{
// Ways of parallelism requested for each loop, taken from the environment.
// NOTE(review): read_env() is defined elsewhere; what it returns for an
// unset or malformed variable is not visible here -- confirm.
dim_t jc_way = read_env( "BLIS_JC_NT" );
dim_t kc_way = read_env( "BLIS_KC_NT" );
dim_t ic_way = read_env( "BLIS_IC_NT" );
dim_t jr_way = read_env( "BLIS_JR_NT" );
dim_t ir_way = read_env( "BLIS_IR_NT" );
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
// Any single factor being zero makes the product zero.
// NOTE(review): this is the only validation and it vanishes under NDEBUG.
assert( global_num_threads != 0 );
// Size of each level's communicator: the product of the ways of all loops
// nested inside that level.
dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
dim_t kc_nt = ic_way * jr_way * ir_way;
dim_t ic_nt = jr_way * ir_way;
dim_t jr_nt = ir_way;
dim_t ir_nt = 1;
// NOTE(review): the malloc() result is not checked before paths[] is
// written below; the nodes themselves come from bli_malloc() instead.
trmm_thrinfo_t** paths = (trmm_thrinfo_t**) malloc( global_num_threads * sizeof( trmm_thrinfo_t* ) );
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
// Each loop index (a..e) is the work id at that level. A communicator is
// created once per iteration of its level, so all threads generated by the
// loops nested inside that iteration share it.
for( int a = 0; a < jc_way; a++ )
{
thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
for( int b = 0; b < kc_way; b++ )
{
thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
for( int c = 0; c < ic_way; c++ )
{
thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
for( int d = 0; d < jr_way; d++ )
{
thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
for( int e = 0; e < ir_way; e++)
{
thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
// This thread's id inside each communicator, built from the innermost
// level outwards: id at a level = (work id one level in) * (size of that
// inner communicator) + (id inside that inner communicator).
dim_t ir_comm_id = 0;
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
dim_t global_comm_id = a*jc_nt + jc_comm_id;
// Nodes are created innermost first so each can be linked as the
// sub_trmm of the next outer one. For every node, ocomm is the
// communicator of the enclosing level and icomm is the level's own.
trmm_thrinfo_t* ir_info = bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id,
ir_comm, ir_comm_id,
ir_way, e,
NULL, NULL, NULL);
trmm_thrinfo_t* jr_info = bli_create_trmm_thrinfo_node( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
jr_way, d,
NULL, NULL, ir_info);
// Packing thread infos are attached only to the ic-level node.
// NOTE(review): the last two arguments are presumably n_way and work_id
// for the packing loop (bli_create_packm_thread_info is declared
// elsewhere) -- confirm.
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
ic_nt, ic_comm_id );
trmm_thrinfo_t* ic_info = bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
ic_way, c,
packb, packa, jr_info);
trmm_thrinfo_t* kc_info = bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
kc_way, b,
NULL, NULL, ic_info);
trmm_thrinfo_t* jc_info = bli_create_trmm_thrinfo_node( global_comm, global_comm_id,
jc_comm, jc_comm_id,
jc_way, a,
NULL, NULL, kc_info);
// The head of this thread's chain is stored at its global id.
paths[global_comm_id] = jc_info;
}
}
}
}
}
return paths;
}

View File

@@ -0,0 +1,79 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Per-thread description of one loop level of a parallel trmm.
// NOTE(review): "implements thrinfo_t" -- presumably the leading members
// mirror thrinfo_t so a pointer to this struct can be handed to code that
// expects a thrinfo_t*; confirm against the thrinfo_t definition.
struct trmm_thrinfo_s //implements thrinfo_t
{
thread_comm_t* ocomm; //Outer communicator: bli_create_trmm_thrinfo_paths passes the enclosing loop level's communicator here
dim_t ocomm_id; //Our thread id within ocomm
thread_comm_t* icomm; //Inner communicator: bli_create_trmm_thrinfo_paths passes this loop level's own communicator here
dim_t icomm_id; //Our thread id within icomm
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
dim_t work_id; //Which of the n_way shares of the loop this thread works on
packm_thrinfo_t* opackm; //packm thread info (NULL on levels that do not pack); the path builder stores its "packb" object here
packm_thrinfo_t* ipackm; //packm thread info (NULL on levels that do not pack); the path builder stores its "packa" object here
struct trmm_thrinfo_s* sub_trmm; //Node for the next inner loop level; NULL at the innermost level, the node itself in the single-threaded setup
};
typedef struct trmm_thrinfo_s trmm_thrinfo_t;
// Accessors for the children of a trmm thrinfo node. `thread` is any
// expression of pointer-to-node type; it is parenthesized so arguments such
// as `&node` or `cond ? a : b` expand correctly.
#define trmm_thread_sub_trmm( thread ) ( (thread)->sub_trmm )
#define trmm_thread_sub_opackm( thread ) ( (thread)->opackm )
#define trmm_thread_sub_ipackm( thread ) ( (thread)->ipackm )
// Round-robin ownership tests: iteration `index` belongs to `thread` when
// index and the thread's work_id are congruent modulo n_way. The four
// variants (left/right side, jr/ir loop) currently share one definition.
// `index` is parenthesized so that an argument like `j + 1` is reduced as a
// whole rather than only its last operand. Note that `thread` is evaluated
// more than once, so it must be free of side effects.
#define trmm_r_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trmm_r_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trmm_l_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trmm_l_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
// Builds the per-thread thrinfo chains for trmm and returns an array of
// chain heads indexed by global thread id. Takes no arguments; declared with
// `void` so that a call passing arguments is diagnosed by the compiler.
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( void );
// Counterpart of bli_create_trmm_thrinfo_paths(); takes the array it returned.
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads );
// Stores the given communicators, ids, split (n_way / work_id), packm infos
// and child node into an existing node `thread`.
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
                                  thread_comm_t* ocomm, dim_t ocomm_id,
                                  thread_comm_t* icomm, dim_t icomm_id,
                                  dim_t n_way, dim_t work_id,
                                  packm_thrinfo_t* opackm,
                                  packm_thrinfo_t* ipackm,
                                  trmm_thrinfo_t* sub_trmm );
// Allocates a node, initialises it exactly like bli_setup_trmm_thrinfo_node(),
// and returns it.
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
                                              thread_comm_t* icomm, dim_t icomm_id,
                                              dim_t n_way, dim_t work_id,
                                              packm_thrinfo_t* opackm,
                                              packm_thrinfo_t* ipackm,
                                              trmm_thrinfo_t* sub_trmm );
// Initialises `thread` for a run without parallelism (one way, work id 0).
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread );

View File

@@ -127,12 +127,20 @@ void bli_trmm3_front( side_t side,
if ( bli_is_left( side ) ) cntl = l_cntl;
else cntl = r_cntl;
// Invoke the internal back-end.
bli_trmm_int( alpha,
&a_local,
&b_local,
beta,
&c_local,
cntl );
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_trmm_int,
alpha,
&a_local,
&b_local,
beta,
&c_local,
(void*) cntl,
(void**) infos );
bli_trmm_thrinfo_free_paths( infos );
}

View File

@@ -216,17 +216,18 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels )
return info_paths;
}
*/
void bli_get_range( void* thr, dim_t size, dim_t block_factor, dim_t* start, dim_t* end )
void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
{
thrinfo_t* thread = (thrinfo_t*) thr;
dim_t n_way = thread->n_way;
dim_t work_id = thread->work_id;
dim_t size = all_end - all_start;
dim_t n_pt = size / n_way;
n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt;
n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor);
*start = work_id * n_pt;
*end = bli_min( *start + n_pt, size );
*start = work_id * n_pt + all_start;
*end = bli_min( *start + n_pt, size + all_start );
}
void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)

View File

@@ -87,7 +87,8 @@ typedef struct thrinfo_s thrinfo_t;
#define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id )
#define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id )
void bli_get_range( void* thread, dim_t size, dim_t block_factor, dim_t* start, dim_t* end );
void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
@@ -98,6 +99,7 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm
#include "bli_packm_threading.h"
#include "bli_gemm_threading.h"
#include "bli_herk_threading.h"
#include "bli_trmm_threading.h"
typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread );
void bli_level3_thread_decorator( dim_t num_threads,