diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index eb828543b..f7e60e406 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)( void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p + dim_t pd_p, inc_t ps_p, + packm_thrinfo_t* thread ); //static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3); void bli_packm_blk_var3( obj_t* c, - obj_t* p ) + obj_t* p, + packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -154,7 +156,8 @@ void bli_packm_blk_var3( obj_t* c, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, - pd_p, ps_p ); + pd_p, ps_p, + t ); } @@ -177,7 +180,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ) \ { \ ctype* restrict kappa_cast = kappa; \ @@ -296,14 +300,18 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ + dim_t t_id = thread_id( thread ); \ + dim_t num_threads = thread_num_threads( thread ); \ + p_inc = ps_p; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ + for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ + ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ + p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -437,8 +445,6 @@ void PASTEMAC(ch,varname)( \ */ \ \ } \ -\ - p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var3.h b/frame/1m/packm/bli_packm_blk_var3.h index 6189d2415..b1d684262 100644 --- a/frame/1m/packm/bli_packm_blk_var3.h 
+++ b/frame/1m/packm/bli_packm_blk_var3.h @@ -33,7 +33,8 @@ */ void bli_packm_blk_var3( obj_t* c, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* t ); #undef GENTPROTCO @@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ); INSERT_GENTPROTCO_BASIC( packm_blk_var3 ) diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c index dd1cedfc8..d8721df75 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)( void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p + dim_t pd_p, inc_t ps_p, + packm_thrinfo_t* thread ); //static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4); void bli_packm_blk_var4( obj_t* c, - obj_t* p ) + obj_t* p, + packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -154,7 +156,8 @@ void bli_packm_blk_var4( obj_t* c, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, - pd_p, ps_p ); + pd_p, ps_p, + t ); } @@ -177,7 +180,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ) \ { \ ctype* restrict kappa_cast = kappa; \ @@ -296,14 +300,18 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ + dim_t t_id = thread_id( thread ); \ + dim_t num_threads = thread_num_threads( thread ); \ + p_inc = ps_p; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ + for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ + ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + 
(ic )*vs_c; \ + p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -452,8 +460,6 @@ void PASTEMAC(ch,varname)( \ */ \ \ } \ -\ - p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var4.h b/frame/1m/packm/bli_packm_blk_var4.h index e13e5fe33..e727873e4 100644 --- a/frame/1m/packm/bli_packm_blk_var4.h +++ b/frame/1m/packm/bli_packm_blk_var4.h @@ -33,7 +33,8 @@ */ void bli_packm_blk_var4( obj_t* c, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* t ); #undef GENTPROTCO @@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ); INSERT_GENTPROTCO_BASIC( packm_blk_var4 )