mirror of
https://github.com/amd/blis.git
synced 2026-05-14 03:02:08 +00:00
Parallelized trmm and trmm3
Also fixed bugs in packm
This commit is contained in:
@@ -263,18 +263,14 @@ void PASTEMAC(ch,varname )( \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
dim_t t_id = thread_id( thread ); \
|
||||
dim_t num_threads = thread_num_threads( thread ); \
|
||||
p_inc = ps_p; \
|
||||
\
|
||||
for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \
|
||||
ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
p_begin = p_cast + (ip )*p_inc; \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
@@ -323,6 +319,8 @@ void PASTEMAC(ch,varname )( \
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
@@ -336,6 +334,7 @@ void PASTEMAC(ch,varname )( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
}\
|
||||
\
|
||||
\
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -349,6 +348,8 @@ void PASTEMAC(ch,varname )( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
@@ -360,6 +361,7 @@ void PASTEMAC(ch,varname )( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -373,6 +375,8 @@ void PASTEMAC(ch,varname )( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
@@ -384,10 +388,13 @@ void PASTEMAC(ch,varname )( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
} \
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
|
||||
@@ -303,18 +303,14 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
dim_t t_id = thread_id( thread ); \
|
||||
dim_t num_threads = thread_num_threads( thread ); \
|
||||
p_inc = ps_p; \
|
||||
\
|
||||
for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \
|
||||
ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
p_begin = p_cast + (ip )*p_inc; \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
@@ -363,6 +359,8 @@ void PASTEMAC(ch,varname)( \
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
@@ -376,6 +374,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
@@ -399,6 +398,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
@@ -411,6 +412,7 @@ void PASTEMAC(ch,varname)( \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
} \
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
} \
|
||||
@@ -423,6 +425,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
@@ -434,6 +438,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
@@ -448,6 +453,8 @@ void PASTEMAC(ch,varname)( \
|
||||
*/ \
|
||||
\
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -303,18 +303,14 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
dim_t t_id = thread_id( thread ); \
|
||||
dim_t num_threads = thread_num_threads( thread ); \
|
||||
p_inc = ps_p; \
|
||||
\
|
||||
for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \
|
||||
ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
p_begin = p_cast + (ip )*p_inc; \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
@@ -363,6 +359,8 @@ void PASTEMAC(ch,varname)( \
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
@@ -376,6 +374,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
\
|
||||
@@ -406,6 +405,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
@@ -417,6 +418,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -430,6 +432,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
@@ -441,6 +445,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -463,6 +468,8 @@ void PASTEMAC(ch,varname)( \
|
||||
*/ \
|
||||
\
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -44,6 +44,8 @@ struct packm_thrinfo_s //implements thrinfo_t
|
||||
};
|
||||
typedef struct packm_thrinfo_s packm_thrinfo_t;
|
||||
|
||||
// Evaluates to nonzero when loop iteration `index` belongs to `thread`:
// the iteration index and the thread's work_id are compared modulo
// thread->n_way. Both arguments are parenthesized so that expression
// arguments (e.g. `it + 1`, `&info`) expand with the intended precedence.
// NOTE: `thread` is still evaluated more than once; pass a side-effect-free
// expression.
#define packm_thread_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
|
||||
|
||||
packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id );
|
||||
void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
|
||||
@@ -83,7 +83,7 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, m_trans, 8, &start, &end );
|
||||
bli_get_range( thread, 0, m_trans, 8, &start, &end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
|
||||
@@ -82,7 +82,7 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, n_trans, 8, &start, &end );
|
||||
bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
|
||||
@@ -85,7 +85,7 @@ void bli_hemm_front( side_t side,
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t*) bli_gemm_int,
|
||||
(level3_int_t) bli_gemm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
|
||||
@@ -116,7 +116,7 @@ void bli_her2k_front( obj_t* alpha,
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t*) bli_herk_int,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&bh_local,
|
||||
@@ -126,7 +126,7 @@ void bli_her2k_front( obj_t* alpha,
|
||||
(void**) infos );
|
||||
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t*) bli_herk_int,
|
||||
(level3_int_t) bli_herk_int,
|
||||
&alpha_conj,
|
||||
&b_local,
|
||||
&ah_local,
|
||||
|
||||
@@ -82,7 +82,7 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bli_obj_length_after_trans( *c );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, m_trans, 8, &start, &end );
|
||||
bli_get_range( thread, 0, m_trans, 8, &start, &end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
|
||||
@@ -90,7 +90,7 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
dim_t start, end;
|
||||
|
||||
// Needs to be replaced with a weighted range because triangle
|
||||
bli_get_range( thread, n_trans, 8, &start, &end );
|
||||
bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
|
||||
@@ -82,7 +82,7 @@ void bli_herk_front( obj_t* alpha,
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t*) bli_herk_int,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&ah_local,
|
||||
|
||||
@@ -84,7 +84,7 @@ void bli_symm_front( side_t side,
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t*) bli_gemm_int,
|
||||
(level3_int_t) bli_gemm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
|
||||
@@ -98,7 +98,7 @@ void bli_syr2k_front( obj_t* alpha,
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t*) bli_herk_int,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&bt_local,
|
||||
@@ -108,7 +108,7 @@ void bli_syr2k_front( obj_t* alpha,
|
||||
(void**) infos );
|
||||
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t*) bli_herk_int,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&b_local,
|
||||
&at_local,
|
||||
|
||||
@@ -78,7 +78,7 @@ void bli_syrk_front( obj_t* alpha,
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t*) bli_herk_int,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&at_local,
|
||||
|
||||
@@ -37,21 +37,48 @@
|
||||
void bli_trmm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t b_pack_s;
|
||||
obj_t a1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t a1, c1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offA;
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing B.
|
||||
bli_obj_init_pack( &b_pack_s );
|
||||
bli_packm_init( b, &b_pack_s,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
b_pack = thread_obroadcast( thread, &b_pack_s );
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack B (if instructed).
|
||||
bli_packm_int( b, b_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Set the default length of and offset to the non-zero part of A.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
@@ -66,25 +93,14 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) +
|
||||
bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing B.
|
||||
bli_packm_init( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack B (if instructed).
|
||||
bli_packm_int( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, offA, m_trans, 8, &start, &end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = offA; i < m_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, m_trans, a,
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
@@ -94,38 +110,55 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Packing must be finished before computation
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b_pack,
|
||||
a1_pack,
|
||||
b_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
// Currently must be done by 1 thread
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
//Barrier to make sure unpacking is done before next iteration's packing of C
|
||||
//Somehow, we'd like to make this a noop if packing isn't done.
|
||||
thread_ibarrier( thread );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( b_pack );
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,43 +37,58 @@
|
||||
void bli_trmm_blk_var2b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread)
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t b1, c1;
|
||||
obj_t* a_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = 0; i < n_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( i, n_trans, b,
|
||||
b_alg = bli_determine_blocksize_b( i, end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
@@ -83,38 +98,55 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing B1 and C1.
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Packing must be finished before computation
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a_pack,
|
||||
&b1_pack,
|
||||
a_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
// Unpack C1 (if C1 was packed).
|
||||
// Currently must be done by 1 thread
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
//Barrier to make sure unpacking is done before next iteration's packing of C
|
||||
//Somehow, we'd like to make this a noop if packing isn't done.
|
||||
thread_ibarrier( thread );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( b1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var2b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,43 +37,58 @@
|
||||
void bli_trmm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread)
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t b1, c1;
|
||||
obj_t* a_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, 0, n_trans, 8, &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = 0; i < n_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, n_trans, b,
|
||||
b_alg = bli_determine_blocksize_f( i, end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
@@ -83,38 +98,55 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing B1 and C1.
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Packing must be finished before computation
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a_pack,
|
||||
&b1_pack,
|
||||
a_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
// Unpack C1 (if C1 was packed).
|
||||
// Currently must be done by 1 thread
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
}
|
||||
//Barrier to make sure unpacking is done before next iteration's packing of C
|
||||
//Somehow, we'd like to make this a noop if packing isn't done.
|
||||
thread_ibarrier( thread );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( b1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,38 +37,50 @@
|
||||
void bli_trmm_blk_var3b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c_pack;
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, b1_pack_s;
|
||||
|
||||
obj_t a1, b1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c_pack );
|
||||
if( thread_am_ochief( thread ) ){
|
||||
// Initialize object for packing C
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
// Initialize pack objects for A and B that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Partition along the k dimension.
|
||||
for ( i = 0; i < k_trans; i += b_alg )
|
||||
{
|
||||
@@ -83,38 +95,51 @@ void bli_trmm_blk_var3b( obj_t* a,
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Packing must be done before computation
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b1_pack,
|
||||
a1_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
}
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( &c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
thread_obarrier( thread );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c_pack );
|
||||
// Unpack C (if C was packed).
|
||||
if( thread_am_ochief( thread ) ){
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
bli_obj_release_pack( c_pack );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( b1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var3b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,38 +37,50 @@
|
||||
void bli_trmm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c_pack;
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, b1_pack_s;
|
||||
|
||||
obj_t a1, b1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c_pack );
|
||||
if( thread_am_ochief( thread ) ){
|
||||
// Initialize object for packing C
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
// Initialize pack objects for A and B that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Partition along the k dimension.
|
||||
for ( i = 0; i < k_trans; i += b_alg )
|
||||
{
|
||||
@@ -83,38 +95,51 @@ void bli_trmm_blk_var3f( obj_t* a,
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Packing must be done before computation
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b1_pack,
|
||||
a1_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
}
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( &c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
thread_obarrier( thread );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c_pack );
|
||||
// Unpack C (if C was packed).
|
||||
if( thread_am_ochief( thread ) ){
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
bli_obj_release_pack( c_pack );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( b1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -125,12 +125,20 @@ void bli_trmm_front( side_t side,
|
||||
if ( bli_is_left( side ) ) cntl = l_cntl;
|
||||
else cntl = r_cntl;
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_trmm_int( alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
&BLIS_ZERO,
|
||||
&c_local,
|
||||
cntl );
|
||||
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_trmm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
&BLIS_ZERO,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_trmm_thrinfo_free_paths( infos );
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
static FUNCPTR_T vars[2][2][4][3] =
|
||||
{
|
||||
@@ -88,7 +89,8 @@ void bli_trmm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
@@ -173,6 +175,7 @@ void bli_trmm_int( obj_t* alpha,
|
||||
f( &a_local,
|
||||
&b_local,
|
||||
&c_local,
|
||||
cntl );
|
||||
cntl,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -37,4 +37,5 @@ void bli_trmm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trmm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
|
||||
void bli_trmm_ll_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -131,7 +133,8 @@ void bli_trmm_ll_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -270,9 +274,12 @@ void PASTEMAC(ch,varname)( \
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
for ( j = 0; j < n_iter; ++j ) { \
|
||||
\
|
||||
if( trmm_l_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
@@ -307,121 +314,124 @@ void PASTEMAC(ch,varname)( \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = diagoffa_i + MR; \
|
||||
\
|
||||
b1_i = b1 + off_a1011 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1011 * ss_a; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, aux ); \
|
||||
bli_auxinfo_set_next_b( b2, aux ); \
|
||||
\
|
||||
/* Save the panel stride of the current panel of A to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux ); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
|
||||
{ \
|
||||
b1_i = b1 + off_a1011 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1011 * ss_a; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, aux ); \
|
||||
bli_auxinfo_set_next_b( b2, aux ); \
|
||||
\
|
||||
/* Save the panel stride of the current panel of A to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux ); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
a1 += k_a1011 * ss_a; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
|
||||
{ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, aux ); \
|
||||
bli_auxinfo_set_next_b( b2, aux ); \
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, aux ); \
|
||||
bli_auxinfo_set_next_b( b2, aux ); \
|
||||
\
|
||||
/* Save the panel stride of the current panel of A to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_ps_a( rstep_a, aux ); \
|
||||
/* Save the panel stride of the current panel of A to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_ps_a( rstep_a, aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trmm_ll_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 )
|
||||
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trmm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
|
||||
void bli_trmm_lu_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -131,7 +133,8 @@ void bli_trmm_lu_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -277,6 +281,8 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -294,7 +300,7 @@ void PASTEMAC(ch,varname)( \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
for ( i = 0; i < m_iter; ++i ) if( trmm_l_jr_my_iter( j, jr_thread ) ) { \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
@@ -315,6 +321,7 @@ void PASTEMAC(ch,varname)( \
|
||||
off_a1112 = diagoffa_i; \
|
||||
k_a1112 = k - off_a1112; \
|
||||
\
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
b1_i = b1 + off_a1112 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
@@ -369,11 +376,12 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += k_a1112 * ss_a; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
@@ -423,13 +431,13 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trmm_lu_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 )
|
||||
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trmm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
|
||||
void bli_trmm_rl_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -131,7 +133,8 @@ void bli_trmm_rl_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -278,6 +282,7 @@ void PASTEMAC(ch,varname)( \
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
@@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \
|
||||
in A. Then compute the length of that panel. */ \
|
||||
off_b1121 = bli_max( -diagoffb_j, 0 ); \
|
||||
k_b1121 = k - off_b1121; \
|
||||
\
|
||||
if( trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
@@ -378,6 +386,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
@@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
b1 += k_b1121 * ss_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trmm_rl_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 )
|
||||
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trmm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
|
||||
void bli_trmm_ru_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -131,7 +133,8 @@ void bli_trmm_ru_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -279,6 +283,7 @@ void PASTEMAC(ch,varname)( \
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
@@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \
|
||||
so we can index into the corresponding location in A. */ \
|
||||
off_b0111 = 0; \
|
||||
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
|
||||
\
|
||||
if( trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
@@ -378,6 +386,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
@@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
b1 += k_b0111 * ss_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trmm_ru_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 )
|
||||
|
||||
173
frame/3/trmm/bli_trmm_threading.c
Normal file
173
frame/3/trmm/bli_trmm_threading.c
Normal file
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm )
|
||||
{
|
||||
thread->ocomm = ocomm;
|
||||
thread->ocomm_id = ocomm_id;
|
||||
thread->icomm = icomm;
|
||||
thread->icomm_id = icomm_id;
|
||||
thread->n_way = n_way;
|
||||
thread->work_id = work_id;
|
||||
thread->opackm = opackm;
|
||||
thread->ipackm = ipackm;
|
||||
thread->sub_trmm = sub_trmm;
|
||||
}
|
||||
|
||||
/*
 * Configure `thread` as a single-threaded trmm thread-info node.
 *
 * Both communicators are set to BLIS_SINGLE_COMM with id 0, the node is
 * given n_way = 1 and work_id = 0, both packm infos point at
 * BLIS_PACKM_SINGLE_THREADED, and sub_trmm points back at the node itself.
 *
 *   thread   node to fill in; must not be NULL
 */
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread )
{
	/* Same nine field stores as a hand-written sequence, expressed through
	   the common setter. */
	bli_setup_trmm_thrinfo_node( thread,
	                             &BLIS_SINGLE_COMM, 0,
	                             &BLIS_SINGLE_COMM, 0,
	                             1, 0,
	                             &BLIS_PACKM_SINGLE_THREADED,
	                             &BLIS_PACKM_SINGLE_THREADED,
	                             thread );
}
|
||||
|
||||
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm )
|
||||
{
|
||||
trmm_thrinfo_t* thread = ( trmm_thrinfo_t* ) bli_malloc( sizeof( trmm_thrinfo_t ) );
|
||||
bli_setup_trmm_thrinfo_node( thread, ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way, work_id,
|
||||
opackm,
|
||||
ipackm,
|
||||
sub_trmm );
|
||||
return thread;
|
||||
}
|
||||
|
||||
/*
 * Placeholder for releasing the array of per-thread paths.
 *
 * The body is empty: `threads` is ignored and nothing is freed.
 * NOTE(review): bli_create_trmm_thrinfo_paths() in this file malloc()s the
 * array and allocates a chain of nodes per thread, so calling this function
 * leaves all of that memory allocated -- TODO implement the release.
 */
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads )
|
||||
{
|
||||
}
|
||||
|
||||
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( )
|
||||
{
|
||||
dim_t jc_way = read_env( "BLIS_JC_NT" );
|
||||
dim_t kc_way = read_env( "BLIS_KC_NT" );
|
||||
dim_t ic_way = read_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = read_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = read_env( "BLIS_IR_NT" );
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
|
||||
dim_t kc_nt = ic_way * jr_way * ir_way;
|
||||
dim_t ic_nt = jr_way * ir_way;
|
||||
dim_t jr_nt = ir_way;
|
||||
dim_t ir_nt = 1;
|
||||
|
||||
|
||||
trmm_thrinfo_t** paths = (trmm_thrinfo_t**) malloc( global_num_threads * sizeof( trmm_thrinfo_t* ) );
|
||||
|
||||
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
|
||||
for( int a = 0; a < jc_way; a++ )
|
||||
{
|
||||
thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
|
||||
for( int b = 0; b < kc_way; b++ )
|
||||
{
|
||||
thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
|
||||
for( int c = 0; c < ic_way; c++ )
|
||||
{
|
||||
thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
|
||||
for( int d = 0; d < jr_way; d++ )
|
||||
{
|
||||
thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
|
||||
for( int e = 0; e < ir_way; e++)
|
||||
{
|
||||
thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
|
||||
dim_t ir_comm_id = 0;
|
||||
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
|
||||
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
|
||||
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
trmm_thrinfo_t* ir_info = bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
trmm_thrinfo_t* jr_info = bli_create_trmm_thrinfo_node( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info);
|
||||
|
||||
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
trmm_thrinfo_t* ic_info = bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
packb, packa, jr_info);
|
||||
|
||||
trmm_thrinfo_t* kc_info = bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
NULL, NULL, ic_info);
|
||||
|
||||
trmm_thrinfo_t* jc_info = bli_create_trmm_thrinfo_node( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
NULL, NULL, kc_info);
|
||||
paths[global_comm_id] = jc_info;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return paths;
|
||||
}
|
||||
79
frame/3/trmm/bli_trmm_threading.h
Normal file
79
frame/3/trmm/bli_trmm_threading.h
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
// Per-thread bookkeeping for one loop level of the parallel trmm algorithm.
// bli_create_trmm_thrinfo_paths builds one node per level (jc, kc, ic, jr, ir)
// for every thread and chains them from outermost to innermost through sub_trmm.
// NOTE(review): bli_get_range casts its void* argument to thrinfo_t* and reads
// n_way/work_id, so the leading fields presumably have to stay layout-compatible
// with thrinfo_s -- confirm.
struct trmm_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //Outer communicator: the one created one loop level further out (e.g. ic_comm for the jr node); thread_obarrier barriers on it
|
||||
dim_t ocomm_id; //Our thread id within ocomm
|
||||
thread_comm_t* icomm; //Inner communicator: the one created for this node's own loop level (e.g. jr_comm for the jr node); thread_ibarrier barriers on it
|
||||
dim_t icomm_id; //Our thread id within icomm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //Which of the n_way caucuses this thread belongs to; the *_my_iter macros compare it modulo n_way
|
||||
|
||||
packm_thrinfo_t* opackm; //Outer packm thread info: set to packb in the ic-level node, NULL in every other node built by bli_create_trmm_thrinfo_paths
|
||||
packm_thrinfo_t* ipackm; //Inner packm thread info: set to packa in the ic-level node, NULL in every other node built by bli_create_trmm_thrinfo_paths
|
||||
struct trmm_thrinfo_s* sub_trmm; //Node for the next loop level down; NULL at the innermost (ir) node
|
||||
};
|
||||
typedef struct trmm_thrinfo_s trmm_thrinfo_t; //Alias used by the trmm threading declarations below
|
||||
|
||||
// Yields the sub_trmm member (next-level trmm node) of `thread`.
// The argument and the expansion are parenthesized so that arguments such as
// &node or another macro call expand with the intended precedence.
#define trmm_thread_sub_trmm( thread ) ( (thread)->sub_trmm )
|
||||
// Yields the opackm member (outer packm thread info) of `thread`.
// The argument and the expansion are parenthesized so that arguments such as
// &node expand with the intended precedence.
#define trmm_thread_sub_opackm( thread ) ( (thread)->opackm )
|
||||
// Yields the ipackm member (inner packm thread info) of `thread`.
// The argument and the expansion are parenthesized so that arguments such as
// &node expand with the intended precedence.
#define trmm_thread_sub_ipackm( thread ) ( (thread)->ipackm )
|
||||
|
||||
// True when iteration `index` of the ir loop (right-side trmm) belongs to this
// thread: iterations are dealt round-robin over n_way and work_id selects the
// residue. Both arguments are parenthesized so that expressions such as
// `i + 1` or `&info` expand correctly. `thread` is evaluated more than once,
// so pass an expression without side effects.
#define trmm_r_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
|
||||
// True when iteration `index` of the jr loop (right-side trmm) belongs to this
// thread: iterations are dealt round-robin over n_way and work_id selects the
// residue. Both arguments are parenthesized so that expressions such as
// `j + 1` or `&info` expand correctly. `thread` is evaluated more than once,
// so pass an expression without side effects.
#define trmm_r_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
|
||||
// True when iteration `index` of the ir loop (left-side trmm) belongs to this
// thread: iterations are dealt round-robin over n_way and work_id selects the
// residue. Both arguments are parenthesized so that expressions such as
// `i + 1` or `&info` expand correctly. `thread` is evaluated more than once,
// so pass an expression without side effects.
#define trmm_l_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
|
||||
// True when iteration `index` of the jr loop (left-side trmm) belongs to this
// thread: iterations are dealt round-robin over n_way and work_id selects the
// residue. Both arguments are parenthesized so that expressions such as
// `j + 1` or `&info` expand correctly. `thread` is evaluated more than once,
// so pass an expression without side effects.
#define trmm_l_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
|
||||
|
||||
// Returns the array of per-thread trmm thrinfo paths. bli_trmm3_front reads
// element 0 to obtain the thread count and hands the whole array to
// bli_level3_thread_decorator. Declared with (void) so that the declaration is
// a real prototype and calls that pass arguments are rejected at compile time.
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( void );
|
||||
// Counterpart of bli_create_trmm_thrinfo_paths: bli_trmm3_front passes the array
// it obtained from that function here once the threaded call has returned.
// What exactly is released is determined by the definition, not visible here.
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** );
|
||||
|
||||
// Takes an existing node `thread` plus the same value list that
// bli_create_trmm_thrinfo_node accepts; the value parameters carry the same
// names as the fields of struct trmm_thrinfo_s.
// NOTE(review): presumably stores each argument into the like-named field of
// *thread -- the definition is not visible here; confirm.
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm );
|
||||
|
||||
// Returns a trmm_thrinfo_t* for the given values; the parameters carry the same
// names as the fields of struct trmm_thrinfo_s. bli_create_trmm_thrinfo_paths
// calls this once per loop level (ir, jr, ic, kc, jc) for every thread path,
// passing the previously created inner node as sub_trmm.
// Allocation and ownership are decided by the definition, not visible here.
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm );
|
||||
|
||||
// NOTE(review): presumably fills *thread with settings for a run that uses a
// single thread (judging by the name only; the definition is not visible
// here) -- confirm.
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread );
|
||||
@@ -127,12 +127,20 @@ void bli_trmm3_front( side_t side,
|
||||
if ( bli_is_left( side ) ) cntl = l_cntl;
|
||||
else cntl = r_cntl;
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_trmm_int( alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntl );
|
||||
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_trmm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_trmm_thrinfo_free_paths( infos );
|
||||
}
|
||||
|
||||
|
||||
@@ -216,17 +216,18 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels )
|
||||
return info_paths;
|
||||
}
|
||||
*/
|
||||
void bli_get_range( void* thr, dim_t size, dim_t block_factor, dim_t* start, dim_t* end )
|
||||
void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
|
||||
{
|
||||
thrinfo_t* thread = (thrinfo_t*) thr;
|
||||
|
||||
dim_t n_way = thread->n_way;
|
||||
dim_t work_id = thread->work_id;
|
||||
|
||||
dim_t size = all_end - all_start;
|
||||
dim_t n_pt = size / n_way;
|
||||
n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt;
|
||||
n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor);
|
||||
*start = work_id * n_pt;
|
||||
*end = bli_min( *start + n_pt, size );
|
||||
*start = work_id * n_pt + all_start;
|
||||
*end = bli_min( *start + n_pt, size + all_start );
|
||||
}
|
||||
|
||||
void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
|
||||
|
||||
@@ -87,7 +87,8 @@ typedef struct thrinfo_s thrinfo_t;
|
||||
// Calls bli_barrier on the thread's outer communicator with its id in that
// communicator. `thread` is parenthesized so that arguments such as &info
// expand correctly; it is evaluated twice, so pass a side-effect-free expression.
#define thread_obarrier( thread ) bli_barrier( (thread)->ocomm, (thread)->ocomm_id )
|
||||
// Calls bli_barrier on the thread's inner communicator with its id in that
// communicator. `thread` is parenthesized so that arguments such as &info
// expand correctly; it is evaluated twice, so pass a side-effect-free expression.
#define thread_ibarrier( thread ) bli_barrier( (thread)->icomm, (thread)->icomm_id )
|
||||
|
||||
void bli_get_range( void* thread, dim_t size, dim_t block_factor, dim_t* start, dim_t* end );
|
||||
void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
|
||||
|
||||
thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id );
|
||||
void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
@@ -98,6 +99,7 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm
|
||||
#include "bli_packm_threading.h"
|
||||
#include "bli_gemm_threading.h"
|
||||
#include "bli_herk_threading.h"
|
||||
#include "bli_trmm_threading.h"
|
||||
|
||||
// Common signature of the level-3 internal back-ends handed to
// bli_level3_thread_decorator; bli_trmm3_front casts bli_trmm_int to this type.
// The last two arguments are untyped: `cntl` and `thread`.
typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread );
|
||||
void bli_level3_thread_decorator( dim_t num_threads,
|
||||
|
||||
Reference in New Issue
Block a user