Fixed a bug with parallel packing and a bug in allocating the array of thread infos

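In packm variant 1, the variable p_begin was incremented by p_inc at the end of each iteration, creating a dependency between successive loop iterations.
This dependency was removed by computing p_begin directly from the panel index (p_cast + ip*ps_p) inside the loop, allowing each iteration to be executed in parallel.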

In bli_create_thread_info (bli_threading.c), the info_paths array was allocated with sizeof(thrinfo_t*) instead of sizeof(thrinfo_t), i.e. as an array of pointers instead of an array of structs.
This commit is contained in:
Tyler Smith
2014-02-27 15:53:10 -06:00
parent 6193d9ceea
commit bfe214b633
2 changed files with 6 additions and 8 deletions

View File

@@ -187,9 +187,6 @@ void PASTEMAC(ch,varname )( \
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
dim_t t_id = thread_id( thread ); \
dim_t num_threads = thread_num_threads( thread ); \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
@@ -266,6 +263,9 @@ void PASTEMAC(ch,varname )( \
} \
\
p_begin = p_cast; \
dim_t t_id = thread_id( thread ); \
dim_t num_threads = thread_num_threads( thread ); \
\
\
for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \
ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \
@@ -274,6 +274,7 @@ void PASTEMAC(ch,varname )( \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
p_begin = p_cast + (ip )*ps_p; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
@@ -389,9 +390,6 @@ void PASTEMAC(ch,varname )( \
BLIS_CONTIG_STRIDE_ALIGN_SIZE. */ \
p_inc = ldp * panel_len_max_i; \
} \
\
\
p_begin += p_inc; \
} \
\
\

View File

@@ -181,7 +181,7 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels )
//Create communicators
thread_comm_tree_t* comm_leaves = (thread_comm_tree_t*)bli_malloc( sizeof(thread_comm_tree_t) * n_threads);
create_comms( caucuses_at_level, n_levels, 0, NULL, comm_leaves, 0 );
thrinfo_t* info_paths = (thrinfo_t*)bli_malloc( sizeof(thrinfo_t*) * n_threads );
thrinfo_t* info_paths = (thrinfo_t*)bli_malloc( sizeof(thrinfo_t) * n_threads );
//Now create paths upwards
for( dim_t i = 0; i < n_threads; i++ )
@@ -209,7 +209,7 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels )
bli_setup_thrinfo_t(cur, comm_node->comm, ocomm_id,
prev, caucuses_at_level[n_levels - j - 1], caucus_id );
cur = prev;
prev = cur;
comm_node = comm_node->parent;
}
}