diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index e521b769f..66f2ce70b 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -66,7 +66,7 @@ void bli_gemm_blk_var2f( obj_t* a, } a_pack = thread_obroadcast( thread, &a_pack_s ); - // Initialize all pack objects that are passed into packm_init(). + // Initialize pack objects for B and C that are passed into packm_init(). if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &b1_pack_s ); bli_obj_init_pack( &c1_pack_s ); @@ -129,7 +129,7 @@ void bli_gemm_blk_var2f( obj_t* a, &BLIS_ONE, c1_pack, cntl_sub_gemm( cntl ), - gemm_thread_sub_gemm( thread) ); + gemm_thread_sub_gemm( thread ) ); // Unpack C1 (if C1 was packed). // Currently must be done by 1 thread diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index 8af9837a0..f0647ccb3 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -80,17 +80,15 @@ void bli_gemm_blk_var3f( obj_t* a, // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - dim_t start, end; - bli_get_range( thread, k_trans, 1, &start, &end ); // Partition along the k dimension. - for ( i = start; i < end; i += b_alg ) + for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of b (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. - b_alg = bli_determine_blocksize_f( i, end, b, + b_alg = bli_determine_blocksize_f( i, k_trans, b, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and B1. @@ -140,18 +138,20 @@ void bli_gemm_blk_var3f( obj_t* a, } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ochief( thread ) ){ + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } // If any packing buffers were acquired within packm, release them back // to the memory manager. - thread_obarrier( thread ); if( thread_am_ichief( thread ) ){ bli_obj_release_pack( a1_pack ); bli_obj_release_pack( b1_pack ); } - if( thread_am_ochief( thread ) ) - bli_obj_release_pack( c_pack ); } diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 753182a8f..fd6f92c14 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -60,11 +60,6 @@ gemm_thrinfo_t* bli_gemm_cntl_get_thrinfos() return bli_create_gemm_thrinfo_paths( ); } -void bli_gemm_cntl_free_thrinfos(thrinfo_t* tofree) -{ - //MEMORYLEAK -} - void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 1c26681af..88bc32d9a 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -91,6 +91,6 @@ void bli_gemm_front( obj_t* alpha, &infos[omp_id] ); } - bli_gemm_cntl_free_thrinfos( infos ); + bli_gemm_thrinfo_free_paths( infos ); } diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 7a71e8f31..7d0734e40 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -218,13 +218,13 @@ void PASTEMAC(ch,varname)( \ bli_auxinfo_set_ps_b( ps_b, aux ); \ \ gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \ - dim_t l2_num_threads = thread_n_way( thread ); \ - dim_t l2_thread_id = thread_work_id( thread ); \ - dim_t l1_num_threads = thread_n_way( caucus ); \ - dim_t l1_thread_id = thread_work_id( caucus ); \ + dim_t jr_num_threads = thread_n_way( thread ); \ + dim_t jr_thread_id = thread_work_id( thread ); \ + dim_t ir_num_threads = thread_n_way( caucus ); \ + dim_t ir_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = l2_thread_id; j < n_iter; j += l2_num_threads ) \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -239,7 +239,7 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = l1_thread_id; i < m_iter; i += l1_num_threads ) \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ \ diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 15c3aa84b..b0d28c8c5 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -95,6 +95,10 @@ dim_t read_env( char* env ) return number; } +void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t* threads ) +{ +} + gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ) { dim_t jc_way = read_env( "BLIS_JC_NT" ); diff --git a/frame/3/gemm/bli_gemm_threading.h b/frame/3/gemm/bli_gemm_threading.h index d046608da..280ba96ad 100644 --- a/frame/3/gemm/bli_gemm_threading.h +++ b/frame/3/gemm/bli_gemm_threading.h @@ -54,6 +54,7 @@ typedef struct gemm_thrinfo_s gemm_thrinfo_t; #define gemm_thread_sub_ipackm( thread ) thread->ipackm gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ); +void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t* ); void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 4613857b8..fde8f9f70 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -98,6 +98,6 @@ void bli_hemm_front( side_t side, &infos[omp_id] ); } - bli_gemm_cntl_free_thrinfos( infos ); + bli_gemm_thrinfo_free_paths( infos ); } diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index bfcd076cf..1097c338c 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -115,14 +115,16 @@ void bli_her2k_front( obj_t* alpha, &bh_local, beta, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); bli_herk_int( &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); #endif } diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index fcb8afbb7..88671b99f 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -37,43 +37,58 @@ void bli_herk_blk_var1f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t ah_pack; - obj_t c1, c1_pack; + obj_t ah_pack_s; + obj_t a1_pack_s, c1_pack_s; + + obj_t a1, c1; + obj_t* a1_pack; + obj_t* c1_pack; + obj_t* ah_pack; dim_t i; dim_t b_alg; dim_t m_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &ah_pack ); - bli_obj_init_pack( &c1_pack ); + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A'. + bli_obj_init_pack( &ah_pack_s ); + bli_packm_init( ah, &ah_pack_s, + cntl_sub_packm_b( cntl ) ); + + // Scale C by beta (if instructed). + // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + ah_pack = thread_obroadcast( thread, &ah_pack_s ); + + // Initialize pack objects that are passed into packm_init() for A and C. + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A' (if instructed). + bli_packm_int( ah, ah_pack, + cntl_sub_packm_b( cntl ), + herk_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *c ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A'. - bli_packm_init( ah, &ah_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack A' (if instructed). - bli_packm_int( ah, &ah_pack, - cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, m_trans, 8, &start, &end ); // Partition along the m dimension. - for ( i = 0; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -83,38 +98,53 @@ void bli_herk_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, - &a1_pack, - &ah_pack, + a1_pack, + ah_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_herk( cntl ) ); + c1_pack, + cntl_sub_herk( cntl ), + herk_thread_sub_herk( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &ah_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( ah_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/herk/bli_herk_blk_var1f.h b/frame/3/herk/bli_herk_blk_var1f.h index dfcae5c99..2a1b85f6e 100644 --- a/frame/3/herk/bli_herk_blk_var1f.h +++ b/frame/3/herk/bli_herk_blk_var1f.h @@ -35,5 +35,6 @@ void bli_herk_blk_var1f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index e09b48810..a92888288 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -37,50 +37,66 @@ void bli_herk_blk_var2f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { - obj_t a_pack, aS_pack; - obj_t ah1, ah1_pack; - obj_t c1; - obj_t c1S, c1S_pack; + obj_t a_pack_s; + obj_t ah1_pack_s, c1S_pack_s; + + obj_t ah1, c1, c1S; + obj_t aS_pack; + obj_t* a_pack; + obj_t* ah1_pack; + obj_t* c1S_pack; dim_t i; dim_t b_alg; dim_t n_trans; subpart_t stored_part; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &ah1_pack ); - bli_obj_init_pack( &c1S_pack ); - // The upper and lower variants are identical, except for which // merged subpartition is acquired in the loop body. if ( bli_obj_is_lower( *c ) ) stored_part = BLIS_SUBPART1B; else stored_part = BLIS_SUBPART1T; - // Query dimension in partitioning direction. - n_trans = bli_obj_width_after_trans( *c ); + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + // Initialize pack objects for C and A' that are passed into packm_init(). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &ah1_pack_s ); + bli_obj_init_pack( &c1S_pack_s ); + } + ah1_pack = thread_ibroadcast( thread, &ah1_pack_s ); + c1S_pack = thread_ibroadcast( thread, &c1S_pack_s ); // Pack A (if instructed). bli_packm_int( a, &a_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + hemm_thread_sub_opackm( thread ) ); + + // Query dimension in partitioning direction. + n_trans = bli_obj_width_after_trans( *c ); + dim_t start, end; + + // Needs to be replaced with a weighted range because triangle + bli_get_range( thread, n_trans, 8, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, n_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1' and C1. @@ -91,44 +107,55 @@ void bli_herk_blk_var2f( obj_t* a, // Partition off the stored region of C1 and the corresponding region // of A_pack. - bli_acquire_mpart_t2b( stored_part, - i, b_alg, &c1, &c1S ); - bli_acquire_mpart_t2b( stored_part, - i, b_alg, &a_pack, &aS_pack ); + bli_acquire_mpart_t2b( stored_part, + i, b_alg, &c1, &c1S ); + bli_acquire_mpart_t2b( stored_part, + i, b_alg, a_pack, &aS_pack ); // Initialize objects for packing A1' and C1. - bli_packm_init( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1S, &c1S_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &ah1, ah1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1S, c1S_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ) ; // Pack A1' (if instructed). - bli_packm_int( &ah1, &ah1_pack, + bli_packm_int( &ah1, ah1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1S, &c1S_pack, + bli_packm_int( &c1S, c1S_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ) ; // Perform herk subproblem. bli_herk_int( &BLIS_ONE, &aS_pack, - &ah1_pack, + ah1_pack, &BLIS_ONE, - &c1S_pack, - cntl_sub_herk( cntl ) ); + c1S_pack, + cntl_sub_herk( cntl ), + herk_thread_sub_herk( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1S_pack, &c1S, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1S_pack, &c1S, + cntl_sub_unpackm_c( cntl ) ); + } + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &ah1_pack ); - bli_obj_release_pack( &c1S_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( ah1_pack ); + bli_obj_release_pack( c1S_pack ); + } } diff --git a/frame/3/herk/bli_herk_blk_var2f.h b/frame/3/herk/bli_herk_blk_var2f.h index 4932535d1..1d405f214 100644 --- a/frame/3/herk/bli_herk_blk_var2f.h +++ b/frame/3/herk/bli_herk_blk_var2f.h @@ -35,5 +35,6 @@ void bli_herk_blk_var2f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index cb3b323e2..61ca8c7de 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -37,38 +37,50 @@ void bli_herk_blk_var3f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t ah1, ah1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, ah1_pack_s; + + obj_t a1, ah1; + obj_t* a1_pack = NULL; + obj_t* ah1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; + if( thread_am_ochief( thread ) ) { + // Initialize object for packing C. + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &ah1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &ah1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + ah1_pack = thread_ibroadcast( thread, &ah1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + herk_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,28 +95,22 @@ void bli_herk_blk_var3f( obj_t* a, i, b_alg, ah, &ah1 ); // Initialize objects for packing A1 and A1'. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &ah1, ah1_pack, + cntl_sub_packm_b( cntl ) ); + } // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &ah1, &ah1_pack, + bli_packm_int( &ah1, ah1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - - // Perform herk subproblem. - bli_herk_int( &BLIS_ONE, - &a1_pack, - &ah1_pack, - &BLIS_ONE, - &c_pack, - cntl_sub_herk( cntl ) ); + herk_thread_sub_ipackm( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it @@ -112,17 +118,36 @@ void bli_herk_blk_var3f( obj_t* a, // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. - if ( i == 0 ) bli_obj_scalar_reset( &c_pack ); + if ( i != 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); + + // Packing must be done before computation + thread_ibarrier( thread ); + + // Perform herk subproblem. + bli_herk_int( &BLIS_ONE, + a1_pack, + ah1_pack, + &BLIS_ONE, + c_pack, + cntl_sub_herk( cntl ), + herk_thread_sub_herk( thread ) ); + } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ochief( thread ) ) { + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &ah1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( ah1_pack ); + } } diff --git a/frame/3/herk/bli_herk_blk_var3f.h b/frame/3/herk/bli_herk_blk_var3f.h index b77ebc33f..22093d421 100644 --- a/frame/3/herk/bli_herk_blk_var3f.h +++ b/frame/3/herk/bli_herk_blk_var3f.h @@ -35,5 +35,6 @@ void bli_herk_blk_var3f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 456cf84a8..ff6a18252 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -77,12 +77,24 @@ void bli_herk_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } + herk_thrinfo_t* infos = bli_herk_cntl_get_thrinfos(); + dim_t n_threads = thread_num_threads( (&infos[0]) ); + // Invoke the internal back-end. - bli_herk_int( alpha, - &a_local, - &ah_local, - beta, - &c_local, - cntl ); + _Pragma( "omp parallel num_threads(n_threads)" ) + { + dim_t omp_id = omp_get_thread_num(); + + + bli_herk_int( alpha, + &a_local, + &ah_local, + beta, + &c_local, + cntl, + &infos[omp_id] ); + } + + bli_herk_thrinfo_free_paths( infos ); } diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/bli_herk_int.c index bc6a1fa5f..64fd7b1c4 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/bli_herk_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); static FUNCPTR_T vars[2][4][3] = { @@ -66,7 +67,8 @@ void bli_herk_int( obj_t* alpha, obj_t* ah, obj_t* beta, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { obj_t a_local; obj_t ah_local; @@ -138,6 +140,7 @@ void bli_herk_int( obj_t* alpha, f( &a_local, &ah_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/herk/bli_herk_int.h b/frame/3/herk/bli_herk_int.h index 1b1973b3e..a3fa6343d 100644 --- a/frame/3/herk/bli_herk_int.h +++ b/frame/3/herk/bli_herk_int.h @@ -37,5 +37,6 @@ void bli_herk_int( obj_t* alpha, obj_t* ah, obj_t* beta, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 8afcf5124..c4d46718b 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + herk_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); void bli_herk_l_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -121,7 +123,8 @@ void bli_herk_l_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \ \ b1 = b_cast; \ c1 = c_cast; \ +\ + herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \ + dim_t jr_num_threads = thread_n_way( thread ); \ + dim_t jr_thread_id = thread_work_id( thread ); \ + dim_t ir_num_threads = thread_n_way( caucus ); \ + dim_t ir_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ @@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ @@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ } diff --git a/frame/3/herk/bli_herk_l_ker_var2.h b/frame/3/herk/bli_herk_l_ker_var2.h index 5dd906db9..09f1c7b31 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.h +++ b/frame/3/herk/bli_herk_l_ker_var2.h @@ -39,7 +39,8 @@ void bli_herk_l_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( herk_l_ker_var2 ) diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c new file mode 100644 index 000000000..ca652f196 --- /dev/null +++ b/frame/3/herk/bli_herk_threading.c @@ -0,0 +1,184 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "assert.h" + +void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ) +{ + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->opackm = opackm; + thread->ipackm = ipackm; + thread->sub_herk = sub_herk; +} + +void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread ) +{ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; + thread->opackm = &BLIS_PACKM_SINGLE_THREADED; + thread->ipackm = &BLIS_PACKM_SINGLE_THREADED; + thread->sub_herk = thread; +} + +herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ) +{ + herk_thrinfo_t* thread = ( herk_thrinfo_t* ) bli_malloc( sizeof( herk_thrinfo_t ) ); + bli_setup_herk_thrinfo_node( thread, ocomm, ocomm_id, + icomm, icomm_id, + n_way, work_id, + opackm, + ipackm, + sub_herk ); + return thread; +} + +dim_t read_env( char* env ) +{ + dim_t number = 1; + char* str = getenv( env ); + if( str != NULL ) + { + number = strtol( str, NULL, 10 ); + } + return number; +} + +void bli_herk_thrinfo_free_paths( herk_thrinfo_t* threads ) +{ +} + +herk_thrinfo_t* bli_create_herk_thrinfo_paths( ) +{ + dim_t jc_way = read_env( "BLIS_JC_NT" ); + dim_t kc_way = read_env( "BLIS_KC_NT" ); + dim_t ic_way = read_env( "BLIS_IC_NT" ); + dim_t jr_way = read_env( "BLIS_JR_NT" ); + dim_t ir_way = read_env( "BLIS_IR_NT" ); + + dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; + assert( global_num_threads != 0 ); + + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; + dim_t kc_nt = ic_way * jr_way * ir_way; + dim_t ic_nt = jr_way * ir_way; + dim_t jr_nt = ir_way; + dim_t ir_nt = 1; + + + herk_thrinfo_t* paths = (herk_thrinfo_t*) malloc( global_num_threads * sizeof( herk_thrinfo_t ) ); + + thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); + for( int a = 0; a < jc_way; a++ ) + { + thread_comm_t* jc_comm = bli_create_communicator( jc_nt ); + for( int b = 0; b < kc_way; b++ ) + { + thread_comm_t* kc_comm = bli_create_communicator( kc_nt ); + for( int c = 0; c < ic_way; c++ ) + { + thread_comm_t* ic_comm = bli_create_communicator( ic_nt ); + for( int d = 0; d < jr_way; d++ ) + { + thread_comm_t* jr_comm = bli_create_communicator( jr_nt ); + for( int e = 0; e < ir_way; e++) + { + thread_comm_t* ir_comm = bli_create_communicator( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t kc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*kc_nt + kc_comm_id; + dim_t global_comm_id = a*jc_nt + jc_comm_id; + + herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id, + ir_comm, ir_comm_id, + ir_way, e, + NULL, NULL, NULL); + + herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + jr_way, d, + NULL, NULL, ir_info); + + packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); + + herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + ic_way, c, + packb, packa, jr_info); + + herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + kc_way, b, + NULL, NULL, ic_info); + + herk_thrinfo_t* jc_info = &paths[global_comm_id]; + bli_setup_herk_thrinfo_node( jc_info, global_comm, global_comm_id, + jr_comm, jr_comm_id, + jr_way, a, + NULL, NULL, kc_info); + } + } + } + } + } + return paths; +} diff --git a/frame/3/herk/bli_herk_threading.h b/frame/3/herk/bli_herk_threading.h new file mode 100644 index 000000000..f0e206cc7 --- /dev/null +++ b/frame/3/herk/bli_herk_threading.h @@ -0,0 +1,74 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +struct herk_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on + + packm_thrinfo_t* opackm; + packm_thrinfo_t* ipackm; + struct herk_thrinfo_s* sub_herk; +}; +typedef struct herk_thrinfo_s herk_thrinfo_t; + +#define herk_thread_sub_herk( thread ) thread->sub_herk +#define herk_thread_sub_opackm( thread ) thread->opackm +#define herk_thread_sub_ipackm( thread ) thread->ipackm + +herk_thrinfo_t* bli_herk_create_thrinfo_paths( ); +void bli_herk_thrinfo_free_paths(); + +void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ); + +herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ); + +void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 9c3d6cf06..573738c0f 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + herk_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); void bli_herk_u_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -121,7 +123,8 @@ void bli_herk_u_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \ \ b1 = b_cast; \ c1 = c_cast; \ +\ + herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \ + dim_t jr_num_threads = thread_n_way( thread ); \ + dim_t jr_thread_id = thread_work_id( thread ); \ + dim_t ir_num_threads = thread_n_way( caucus ); \ + dim_t ir_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ @@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ @@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ } diff --git a/frame/3/herk/bli_herk_u_ker_var2.h b/frame/3/herk/bli_herk_u_ker_var2.h index c6555bc27..481947b8e 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.h +++ b/frame/3/herk/bli_herk_u_ker_var2.h @@ -39,7 +39,8 @@ void bli_herk_u_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( herk_u_ker_var2 ) diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index abc7930a3..99c628c88 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -97,6 +97,6 @@ void bli_symm_front( side_t side, &infos[omp_id] ); } - bli_gemm_cntl_free_thrinfos( infos ); + bli_gemm_thrinfo_free_paths( infos ); } diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 4fa89654b..ab2d0d700 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -98,14 +98,16 @@ void bli_syr2k_front( obj_t* alpha, &bt_local, beta, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); bli_herk_int( alpha, &b_local, &at_local, &BLIS_ONE, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); #endif } diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index cc2f8d15a..9022c9442 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -79,6 +79,7 @@ void bli_syrk_front( obj_t* alpha, &at_local, beta, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); } diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 116ccd971..80eadd8e2 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -48,6 +48,7 @@ obj_t BLIS_MINUS_TWO; packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; +herk_thrinfo_t BLIS_HERK_SINGLE_THREADED; thread_comm_t BLIS_SINGLE_COMM; void bli_init( void ) @@ -65,6 +66,7 @@ void bli_init( void ) bli_setup_communicator( &BLIS_SINGLE_COMM, 1 ); bli_setup_packm_single_threaded_info( &BLIS_PACKM_SINGLE_THREADED ); bli_setup_gemm_single_threaded_info( &BLIS_GEMM_SINGLE_THREADED ); + bli_setup_herk_single_threaded_info( &BLIS_HERK_SINGLE_THREADED ); } void bli_finalize( void ) diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index 9fbfcf21b..b944457b5 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -97,5 +97,6 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm #include "bli_packm_threading.h" #include "bli_gemm_threading.h" +#include "bli_herk_threading.h" #endif diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index ad58e7192..7b2a2dfd4 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -46,5 +46,6 @@ extern obj_t BLIS_MINUS_TWO; extern thread_comm_t BLIS_SINGLE_COMM; extern packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; extern gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; +extern herk_thrinfo_t BLIS_HERK_SINGLE_THREADED; #endif