diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index 44aba8ac7..b35d03f32 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -43,7 +43,6 @@ err_t bli_gemmsup_int obj_t* c, cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl, thrinfo_t* thread ) { @@ -89,45 +88,86 @@ err_t bli_gemmsup_int stor_id == BLIS_CRR ); const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; - const num_t dt = bli_obj_dt( c ); - const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + const num_t dt = bli_obj_dt( c ); + const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool_t is_primary = ( row_pref ? is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); + const bool_t auto_factor = bli_rntm_auto_factor( rntm ); + const dim_t n_threads = bli_rntm_num_threads( rntm ); + bool_t use_bp = TRUE; + dim_t jc_new; + dim_t ic_new; + + if ( is_primary ) { // This branch handles: // - rrr rrc rcr crr for row-preferential kernels // - rcc crc ccr ccc for column-preferential kernels - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t mu = m / MR; const dim_t nu = n / NR; - if ( mu >= nu ) - //if ( m % 2 == 1 && n % 2 == 1 ) + // Decide which algorithm to use (block-panel var2m or panel-block + // var1n) based on the number of micropanels in the m and n dimensions. + // Also, recalculate the automatic thread factorization. + if ( mu >= nu ) use_bp = TRUE; + else /* if ( mu < nu ) */ use_bp = FALSE; + + // If the parallel thread factorization was automatic, we update it + // with a new factorization based on the matrix dimensions in units + // of micropanels. + if ( auto_factor ) + { + if ( use_bp ) + { + // In the block-panel algorithm, the m dimension is parallelized + // with ic_nt and the n dimension is parallelized with jc_nt. + bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new ); + } + else // if ( !use_bp ) + { + // In the panel-block algorithm, the m dimension is parallelized + // with jc_nt and the n dimension is parallelized with ic_nt. + bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new ); + } + + // Update the ways of parallelism for the jc and ic loops, and then + // update the current thread's root thrinfo_t node according to the + // new ways of parallelism value for the jc loop. + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); + bli_l3_sup_thrinfo_update_root( rntm, thread ); + } + + + if ( use_bp ) { #ifdef TRACEVAR + if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var2m primary\n" ); #endif // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, cntl, thread ); + stor_id, cntx, rntm, thread ); } - else // if ( mu < nu ) + else // use_pb { #ifdef TRACEVAR + if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var1n primary\n" ); #endif // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, cntl, thread ); + stor_id, cntx, rntm, thread ); + // *requires nudging of nc up to be a multiple of mr. } } else @@ -136,35 +176,64 @@ err_t bli_gemmsup_int // - rrr rrc rcr crr for column-preferential kernels // - rcc crc ccr ccc for row-preferential kernels - const dim_t mt = bli_obj_width( c ); - const dim_t nt = bli_obj_length( c ); - const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t mu = mt / MR; - const dim_t nu = nt / NR; + const dim_t mu = n / MR; // the n becomes m after a transposition + const dim_t nu = m / NR; // the m becomes n after a transposition - if ( mu >= nu ) - //if ( mt % 2 == 1 && nt % 2 == 1 ) + // Decide which algorithm to use (block-panel var2m or panel-block + // var1n) based on the number of micropanels in the m and n dimensions. + // Also, recalculate the automatic thread factorization. + if ( mu >= nu ) use_bp = TRUE; + else /* if ( mu < nu ) */ use_bp = FALSE; + + // If the parallel thread factorization was automatic, we update it + // with a new factorization based on the matrix dimensions in units + // of micropanels. + if ( auto_factor ) + { + if ( use_bp ) + { + // In the block-panel algorithm, the m dimension is parallelized + // with ic_nt and the n dimension is parallelized with jc_nt. + bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new ); + } + else // if ( !use_bp ) + { + // In the panel-block algorithm, the m dimension is parallelized + // with jc_nt and the n dimension is parallelized with ic_nt. + bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new ); + } + + // Update the ways of parallelism for the jc and ic loops, and then + // update the current thread's root thrinfo_t node according to the + // new ways of parallelism value for the jc loop. + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); + bli_l3_sup_thrinfo_update_root( rntm, thread ); + } + + + if ( use_bp ) { #ifdef TRACEVAR + if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var2m non-primary\n" ); #endif // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans bli_gemmsup_ref_var2m( BLIS_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, cntl, thread ); + stor_id, cntx, rntm, thread ); } - else // if ( mu < nu ) + else // use_pb { #ifdef TRACEVAR + if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var1n non-primary\n" ); #endif // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans bli_gemmsup_ref_var1n( BLIS_TRANSPOSE, alpha, a, b, beta, c, - stor_id, cntx, rntm, cntl, thread ); + stor_id, cntx, rntm, thread ); + // *requires nudging of mc up to be a multiple of nr. } - // *requires nudging of mc,nc up to be a multiple of nr,mr. } // Return success so that the caller knows that we computed the solution. diff --git a/frame/3/bli_l3_sup_int.h b/frame/3/bli_l3_sup_int.h index 418e905da..bdea544b2 100644 --- a/frame/3/bli_l3_sup_int.h +++ b/frame/3/bli_l3_sup_int.h @@ -41,6 +41,5 @@ err_t bli_gemmsup_int obj_t* c, cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h index 5d31c0aa7..c875ab5a8 100644 --- a/frame/3/bli_l3_sup_packm_a.h +++ b/frame/3/bli_l3_sup_packm_a.h @@ -40,7 +40,6 @@ void PASTEMAC(ch,opname) \ ( \ bool_t will_pack, \ packbuf_t pack_buf_type, \ - stor3_t stor_id, \ dim_t m, \ dim_t k, \ dim_t mr, \ @@ -80,7 +79,7 @@ void PASTEMAC(ch,opname) \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ @@ -97,8 +96,11 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) void PASTEMAC(ch,opname) \ ( \ bool_t will_pack, \ + packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ + dim_t m_alloc, \ + dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ @@ -107,6 +109,7 @@ void PASTEMAC(ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ diff --git a/frame/3/bli_l3_sup_packm_b.h b/frame/3/bli_l3_sup_packm_b.h index a934718e4..8ee2cc92c 100644 --- a/frame/3/bli_l3_sup_packm_b.h +++ b/frame/3/bli_l3_sup_packm_b.h @@ -40,7 +40,6 @@ void PASTEMAC(ch,opname) \ ( \ bool_t will_pack, \ packbuf_t pack_buf_type, \ - stor3_t stor_id, \ dim_t k, \ dim_t n, \ dim_t nr, \ @@ -80,7 +79,7 @@ void PASTEMAC(ch,opname) \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ @@ -97,16 +96,20 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) void PASTEMAC(ch,opname) \ ( \ bool_t will_pack, \ + packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ + dim_t k_alloc, \ + dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ - ctype* restrict x, inc_t rs_x, inc_t cs_x, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index f4d260fd8..b3593b839 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -327,3 +327,137 @@ bli_thread_obarrier( thread ); \ ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \ } \ */ + +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ +\ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it; \ + dim_t vector_len; \ + inc_t incc, ldc; \ + inc_t incp, ldp; \ + conj_t conjc; \ +\ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* If c needs a transposition, induce it so that we can more simply + express the remaining parameters and code. */ \ + if ( bli_does_trans( transc ) ) \ + { \ + bli_swap_incs( &rs_c, &cs_c ); \ + bli_toggle_trans( &transc ); \ + } \ +\ + /* Create flags to incidate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + bool_t col_stored = bli_is_col_packed( schema ); \ + /*bool_t row_stored = bli_is_row_packed( schema );*/ \ +\ + if ( col_stored ) \ + { \ + /* Prepare to pack to a column-stored matrix. */ \ + iter_dim = n; \ + vector_len = m; \ + incc = rs_c; \ + ldc = cs_c; \ + incp = 1; \ + ldp = cs_p; \ + } \ + else /* if ( row_stored ) */ \ + { \ + /* Prepare to pack to a row-stored matrix. */ \ + iter_dim = m; \ + vector_len = n; \ + incc = cs_c; \ + ldc = rs_c; \ + incp = 1; \ + ldp = rs_p; \ + } \ +\ + /* Compute the total number of iterations we'll need. */ \ + n_iter = iter_dim; \ +\ +\ + ctype* restrict p_begin = p_cast; \ +\ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ +\ + /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ + ( void )nt; \ + ( void )tid; \ +\ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( it = 0; it < n_iter; it += 1 ) \ + { \ + ctype* restrict c_begin = c_cast + (it )*ldc; \ +\ + ctype* restrict c_use = c_begin; \ + ctype* restrict p_use = p_begin; \ +\ + { \ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + { \ + PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \ + ( \ + conjc, \ + vector_len, \ + kappa_cast, \ + c_use, incc, \ + p_use, incp, \ + cntx, \ + NULL \ + ); \ + } \ +\ + } \ +\ + p_begin += ldp; \ +\ +/* +if ( row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +if ( !row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ \ + } \ +} + +INSERT_GENTFUNCR_BASIC( packm, packm_sup_var2 ) + diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h index 877601ec4..5ccdd3b76 100644 --- a/frame/3/bli_l3_sup_packm_var.h +++ b/frame/3/bli_l3_sup_packm_var.h @@ -58,3 +58,21 @@ void PASTEMAC(ch,varname) \ INSERT_GENTPROT_BASIC0( packm_sup_var1 ) +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ); + +INSERT_GENTPROT_BASIC0( packm_sup_var2 ) + diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c index 657220437..2580feb77 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -360,8 +360,6 @@ void PASTEMAC(ch,varname) \ \ /* const inc_t jrstep_a = rs_a * MR; \ - ( void )jrstep_a; \ - */ \ \ const inc_t irstep_c = cs_c * NR; \ const inc_t irstep_b = cs_b * NR; \ @@ -462,33 +460,56 @@ void PASTEMAC(ch,varname) \ mem_t mem_a = BLIS_MEM_INITIALIZER; \ mem_t mem_b = BLIS_MEM_INITIALIZER; \ \ - /* Prepare the packing destination buffer. If packing is not requested for - matrix B, this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_a) \ - ( \ - packa, \ - BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to a "panel of B". */ \ - stor_id, \ - NC, KC, MR, /* Note this "panel of B" is NC x KC. */ \ - cntx, \ - rntm, \ - &mem_a, \ - thread \ - ); \ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. + NOTE: These bszid_t values, and their order, match that of the bp + algorithm (variant 2) because they are not used to query actual + blocksizes but rather query the ways of parallelism for the various + loops. For example, the 2nd loop in variant 1 partitions in the m + dimension (in increments of MR), but parallelizes that m dimension + with BLIS_JR_NT. The only difference is that the _packa and _packb + arrays have been adjusted for the semantic difference in order in + which packa and packb nodes are encountered in the thrinfo tree. + That is, this panel-block algorithm partitions an NC x KC submatrix + of A to be packed in the 4th loop, and a KC x MC submatrix of B + to be packed in the 3rd loop. */ \ + /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ + bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t* restrict bszids; \ \ - /* Prepare the packing destination buffer. If packing is not requested for - matrix B, this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_b) \ - ( \ - packb, \ - BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to a "block of A". */ \ - stor_id, \ - KC, MC, NR, /* Note this "block of A" is KC x MC. */ \ - cntx, \ - rntm, \ - &mem_b, \ - thread \ - ); \ + /* Set the bszids pointer to the correct bszids array above based on which + matrices (if any) are being packed. */ \ + if ( packa ) { if ( packb ) bszids = bszids_packab; \ + else bszids = bszids_packa; } \ + else { if ( packb ) bszids = bszids_packb; \ + else bszids = bszids_nopack; } \ +\ + /* Determine whether we are using more than one thread. */ \ + const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jc = bszids; \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \ + const dim_t m_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = m_local % NC; \ \ /* Loop over the m dimension (NC rows/columns at a time). */ \ /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ @@ -594,29 +615,48 @@ void PASTEMAC(ch,varname) \ \ ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing A. If we won't be packing A, we alias to + the _pc variables so that code further down can unconditionally + reference the _pa variables. Note that *if* we will be packing + A, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pa; \ + if ( packa ) { bszids_pa = &bszids_pc[1]; \ + thread_pa = bli_thrinfo_sub_node( thread_pc ); } \ + else { bszids_pa = &bszids_pc[0]; \ + thread_pa = thread_pc; } \ \ /* Determine the packing buffer and related parameters for matrix A. (If A will not be packed, then a_use will be set to point to a and the _a_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate - implementation based on the schema deduced from the stor_id. */ \ + implementation based on the schema deduced from the stor_id. + NOTE: packing matrix A in this panel-block algorithm corresponds + to packing matrix B in the block-panel algorithm. */ \ PASTEMAC(ch,packm_sup_a) \ ( \ packa, \ - stor_id, \ + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \ + stor_id, /* a "panel of B". */ \ BLIS_NO_TRANSPOSE, \ + NC, KC, /* This "panel of B" is (at most) NC x KC. */ \ nc_cur, kc_cur, MR, \ - one, \ + &one_local, \ a_pc, rs_a, cs_a, \ &a_use, &rs_a_use, &cs_a_use, \ &ps_a_use, \ cntx, \ + rntm, \ &mem_a, \ - thread \ + thread_pa \ ); \ \ /* Alias a_use so that it's clear this is our current block of - matrix B. */ \ + matrix A. */ \ ctype* restrict a_pc_use = a_use; \ \ /* We don't need to embed the panel stride of A within the auxinfo_t @@ -624,6 +664,20 @@ void PASTEMAC(ch,varname) \ which occurs here, within the macrokernel, not within the millikernel. */ \ /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_ic = &bszids_pa[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \ + const dim_t n_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = n_local % MC; \ \ /* Loop over the n dimension (MC rows at a time). */ \ /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ @@ -712,6 +766,20 @@ void PASTEMAC(ch,varname) \ \ ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing A. If we won't be packing A, we alias to + the _pc variables so that code further down can unconditionally + reference the _pa variables. Note that *if* we will be packing + A, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pb; \ + if ( packb ) { bszids_pb = &bszids_ic[1]; \ + thread_pb = bli_thrinfo_sub_node( thread_ic ); } \ + else { bszids_pb = &bszids_ic[0]; \ + thread_pb = thread_ic; } \ \ /* Determine the packing buffer and related parameters for matrix B. (If B will not be packed, then b_use will be set to point to @@ -723,16 +791,19 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_sup_b) \ ( \ packb, \ - stor_id, \ + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \ + stor_id, /* a "block of A". */ \ BLIS_NO_TRANSPOSE, \ + KC, MC, /* This "block of A" is (at most) KC x MC. */ \ kc_cur, mc_cur, NR, \ - one, \ + &one_local, \ b_ic, rs_b, cs_b, \ &b_use, &rs_b_use, &cs_b_use, \ &ps_b_use, \ cntx, \ + rntm, \ &mem_b, \ - thread \ + thread_pb \ ); \ \ /* Alias b_use so that it's clear this is our current block of @@ -744,6 +815,29 @@ void PASTEMAC(ch,varname) \ micropanels of B. */ \ bli_auxinfo_set_ps_b( ps_b_use, &aux ); \ \ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jr = &bszids_pb[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \ + dim_t jr_left = nc_cur % MR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* An optimization: allow the last jr iteration to contain up to MRE + rows of C and A. (If MRE > MR, the mkernel has agreed to handle + these cases.) Note that this prevents us from declaring jr_iter and + jr_left as const. NOTE: We forgo this optimization when packing A + since packing an extended edge case is not yet supported. */ \ + if ( !packa && !is_mt ) \ + if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \ + { \ + jr_iter--; jr_left += MR; \ + } \ \ /* Loop over the m dimension (NR columns at a time). */ \ /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ @@ -1194,33 +1288,45 @@ void PASTEMAC(ch,varname) \ mem_t mem_a = BLIS_MEM_INITIALIZER; \ mem_t mem_b = BLIS_MEM_INITIALIZER; \ \ - /* Prepare the packing destination buffer. If packing is not requested for - matrix A, this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_a) \ - ( \ - packa, \ - BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to a "block of A". */ \ - stor_id, \ - MC, KC, MR, /* Note this "block of A" is MC x KC. */ \ - cntx, \ - rntm, \ - &mem_a, \ - thread \ - ); \ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. */ \ + /* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \ + bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ + bszid_t* restrict bszids; \ \ - /* Prepare the packing destination buffer. If packing is not requested for - matrix B, this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_b) \ - ( \ - packb, \ - BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to a "panel of B". */ \ - stor_id, \ - KC, NC, NR, /* Note this "panel of B" is KC x NC. */ \ - cntx, \ - rntm, \ - &mem_b, \ - thread \ - ); \ + /* Set the bszids pointer to the correct bszids array above based on which + matrices (if any) are being packed. */ \ + if ( packa ) { if ( packb ) bszids = bszids_packab; \ + else bszids = bszids_packa; } \ + else { if ( packb ) bszids = bszids_packb; \ + else bszids = bszids_nopack; } \ +\ + /* Determine whether we are using more than one thread. */ \ + const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jc = bszids; \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ \ /* Loop over the n dimension (NC rows/columns at a time). */ \ /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ @@ -1324,6 +1430,20 @@ void PASTEMAC(ch,varname) \ \ ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing B. If we won't be packing B, we alias to + the _pc variables so that code further down can unconditionally + reference the _pb variables. Note that *if* we will be packing + B, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pb; \ + if ( packb ) { bszids_pb = &bszids_pc[1]; \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ + else { bszids_pb = &bszids_pc[0]; \ + thread_pb = thread_pc; } \ \ /* Determine the packing buffer and related parameters for matrix B. (If B will not be packed, then a_use will be set to point to @@ -1333,16 +1453,19 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_sup_b) \ ( \ packb, \ - stor_id, \ + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \ + stor_id, /* a "panel of B." */ \ BLIS_NO_TRANSPOSE, \ + KC, NC, /* This "panel of B" is (at most) KC x NC. */ \ kc_cur, nc_cur, NR, \ - one, \ + &one_local, \ b_pc, rs_b, cs_b, \ &b_use, &rs_b_use, &cs_b_use, \ &ps_b_use, \ cntx, \ + rntm, \ &mem_b, \ - thread \ + thread_pb \ ); \ \ /* Alias a_use so that it's clear this is our current block of @@ -1354,6 +1477,20 @@ void PASTEMAC(ch,varname) \ which occurs here, within the macrokernel, not within the millikernel. */ \ /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_ic = &bszids_pb[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ \ /* Loop over the m dimension (MC rows at a time). */ \ /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ @@ -1440,6 +1577,20 @@ void PASTEMAC(ch,varname) \ \ ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Set the bszid_t array and thrinfo_t pointer based on whether + we will be packing B. If we won't be packing A, we alias to + the _ic variables so that code further down can unconditionally + reference the _pa variables. Note that *if* we will be packing + A, the thrinfo_t node will have already been created by a + previous call to bli_thrinfo_grow(), since bszid values of + BLIS_NO_PART cause the tree to grow by two (e.g. to the next + bszid that is a normal bszid_t value). */ \ + bszid_t* restrict bszids_pa; \ + if ( packa ) { bszids_pa = &bszids_ic[1]; \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ + else { bszids_pa = &bszids_ic[0]; \ + thread_pa = thread_ic; } \ \ /* Determine the packing buffer and related parameters for matrix A. (If A will not be packed, then a_use will be set to point to @@ -1449,16 +1600,19 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_sup_a) \ ( \ packa, \ - stor_id, \ + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \ + stor_id, /* a "block of A." */ \ BLIS_NO_TRANSPOSE, \ + MC, KC, /* This "block of A" is (at most) MC x KC. */ \ mc_cur, kc_cur, MR, \ - one, \ + &one_local, \ a_ic, rs_a, cs_a, \ &a_use, &rs_a_use, &cs_a_use, \ &ps_a_use, \ cntx, \ + rntm, \ &mem_a, \ - thread \ + thread_pa \ ); \ \ /* Alias a_use so that it's clear this is our current block of @@ -1469,6 +1623,30 @@ void PASTEMAC(ch,varname) \ millikernel will query and use this to iterate through micropanels of A (if needed). */ \ bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ +\ + /* Grow the thrinfo_t tree. */ \ + bszid_t* restrict bszids_jr = &bszids_pa[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ + dim_t jr_left = nc_cur % NR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* An optimization: allow the last jr iteration to contain up to NRE + columns of C and B. (If NRE > NR, the mkernel has agreed to handle + these cases.) Note that this prevents us from declaring jr_iter and + jr_left as const. NOTE: We forgo this optimization when packing B + since packing an extended edge case is not yet supported. */ \ + if ( !packb && !is_mt ) \ + if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \ + { \ + jr_iter--; jr_left += NR; \ + } \ \ /* Loop over the n dimension (NR columns at a time). */ \ /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index 3ff4eb3a0..0bf3ad854 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -125,7 +125,7 @@ void bli_l3_thread_decorator // Alias thread-local copies of A, B, and C. These will be the objects // we pass down the algorithmic function stack. Making thread-local - // alaises is highly recommended in case a thread needs to change any + // aliases is highly recommended in case a thread needs to change any // of the properties of an object without affecting other threads' // objects. bli_obj_alias_to( a, &a_t ); diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c index d0b48d7e2..a2c453062 100644 --- a/frame/thread/bli_l3_decor_pthreads.c +++ b/frame/thread/bli_l3_decor_pthreads.c @@ -96,7 +96,7 @@ void* bli_l3_thread_entry( void* data_void ) // Alias thread-local copies of A, B, and C. These will be the objects // we pass down the algorithmic function stack. Making thread-local - // alaises is highly recommended in case a thread needs to change any + // aliases is highly recommended in case a thread needs to change any // of the properties of an object without affecting other threads' // objects. bli_obj_alias_to( a, &a_t ); diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/thread/bli_l3_sup_decor.h index 01dfb8f6e..a001e5b74 100644 --- a/frame/thread/bli_l3_sup_decor.h +++ b/frame/thread/bli_l3_sup_decor.h @@ -48,7 +48,6 @@ typedef err_t (*l3supint_t) obj_t* c, cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl, thrinfo_t* thread ); @@ -57,8 +56,6 @@ err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, - //pack_t schema_a, - //pack_t schema_b, obj_t* alpha, obj_t* a, obj_t* b, diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c index 7808c1333..4baacd1ea 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.c +++ b/frame/thread/bli_l3_sup_decor_openmp.c @@ -40,16 +40,14 @@ // Define a dummy function bli_l3_sup_thread_entry(), which is needed in the // pthreads version, so that when building Windows DLLs (with OpenMP enabled // or no multithreading) we don't risk having an unresolved symbol. -//void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; } - +void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; } +//#define PRINT_THRINFO err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, - //pack_t schema_a, - //pack_t schema_b, obj_t* alpha, obj_t* a, obj_t* b, @@ -59,36 +57,8 @@ err_t bli_l3_sup_thread_decorator rntm_t* rntm ) { -#if 0 - - return - bli_gemmsup_int - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm, - 0 - ); - -#else - - // This is part of a hack to support mixed domain in bli_gemm_front(). - // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. - //pack_t schema_a = bli_obj_pack_schema( a ); - //pack_t schema_b = bli_obj_pack_schema( b ); - //bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); - //bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); - - // For sequential execution, we use only one thread. - const dim_t n_threads = 1; + // Query the total number of threads from the rntm_t object. + const dim_t n_threads = bli_rntm_num_threads( rntm ); // NOTE: The sba was initialized in bli_init(). @@ -99,55 +69,45 @@ err_t bli_l3_sup_thread_decorator array_t* restrict array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we can create the global comm below. + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. bli_sba_rntm_set_pool( 0, array, rntm ); - // Set the packing block allocator field of the rntm. + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. bli_membrk_rntm_set_membrk( rntm ); -#if 0 // Allcoate a global communicator for the root thrinfo_t structures. thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); -#endif + _Pragma( "omp parallel num_threads(n_threads)" ) { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* restrict rntm_p = rntm; + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; - cntl_t* cntl_use = NULL; - //thrinfo_t* thread = NULL; - thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED; + // Query the thread's id from OpenMP. + const dim_t tid = omp_get_thread_num(); - const dim_t tid = 0; + // Check for a somewhat obscure OpenMP thread-mistmatch issue. + // NOTE: This calls the same function used for the conventional/large + // code path. + bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); // Use the thread id to access the appropriate pool_t* within the // array_t, and use it to set the sba_pool field within the rntm_t. // If the pool_t* element within the array_t is NULL, it will first // be allocated/initialized. - // NOTE: This is commented out because, in the single-threaded case, - // this is redundant since it's already been done above. - //bli_sba_rntm_set_pool( tid, array, rntm_p ); + bli_sba_rntm_set_pool( tid, array, rntm_p ); - // NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't - // need to alias objects for A, B, and C since they were already aliased - // in bli_*_front(). However, we may add aliasing here in the future so - // that, with all three (_single.c, _openmp.c, _pthreads.c) implementations - // consistently providing local aliases, we can then eliminate aliasing - // elsewhere. - - // Create a default control tree for the operation, if needed. - //bli_l3_cntl_create_if( family, schema_a, schema_b, - // a, b, c, rntm_p, cntl, &cntl_use ); -#if 0 - cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b ); + thrinfo_t* thread = NULL; // Create the root node of the thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); -#endif - - ( void )tid; + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); func ( @@ -158,23 +118,16 @@ err_t bli_l3_sup_thread_decorator c, cntx, rntm_p, - cntl_use, thread ); -#if 0 - // Free the thread's local control tree. - //bli_l3_cntl_free( rntm_p, cntl_use, thread ); - bli_gemm_cntl_free( rntm_p, cntl_use, thread ); - // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( rntm_p, thread ); -#endif + bli_l3_sup_thrinfo_free( rntm_p, thread ); } // We shouldn't free the global communicator since it was already freed // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). + // (called from the thread entry function). // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure @@ -182,8 +135,6 @@ err_t bli_l3_sup_thread_decorator bli_sba_checkin_array( array ); return BLIS_SUCCESS; - -#endif } #endif diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c index 7cbf65381..5b23bc86b 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.c +++ b/frame/thread/bli_l3_sup_decor_pthreads.c @@ -37,12 +37,82 @@ #ifdef BLIS_ENABLE_PTHREADS +// A data structure to assist in passing operands to additional threads. +typedef struct thread_data +{ + l3supint_t func; + opid_t family; + obj_t* alpha; + obj_t* a; + obj_t* b; + obj_t* beta; + obj_t* c; + cntx_t* cntx; + rntm_t* rntm; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; +} thread_data_t; + +// Entry point for additional threads +void* bli_l3_sup_thread_entry( void* data_void ) +{ + thread_data_t* data = data_void; + + l3supint_t func = data->func; + opid_t family = data->family; + obj_t* alpha = data->alpha; + obj_t* a = data->a; + obj_t* b = data->b; + obj_t* beta = data->beta; + obj_t* c = data->c; + cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + dim_t tid = data->tid; + array_t* array = data->array; + thrcomm_t* gl_comm = data->gl_comm; + + ( void )family; + + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + bli_sba_rntm_set_pool( tid, array, rntm_p ); + + thrinfo_t* thread = NULL; + + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); + + return NULL; +} + err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, - //pack_t schema_a, - //pack_t schema_b, obj_t* alpha, obj_t* a, obj_t* b, @@ -52,36 +122,8 @@ err_t bli_l3_sup_thread_decorator rntm_t* rntm ) { -#if 0 - - return - bli_gemmsup_int - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm, - 0 - ); - -#else - - // This is part of a hack to support mixed domain in bli_gemm_front(). - // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. - //pack_t schema_a = bli_obj_pack_schema( a ); - //pack_t schema_b = bli_obj_pack_schema( b ); - //bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); - //bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); - - // For sequential execution, we use only one thread. - const dim_t n_threads = 1; + // Query the total number of threads from the context. + const dim_t n_threads = bli_rntm_num_threads( rntm ); // NOTE: The sba was initialized in bli_init(). @@ -92,91 +134,82 @@ err_t bli_l3_sup_thread_decorator array_t* restrict array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we can create the global comm below. + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. bli_sba_rntm_set_pool( 0, array, rntm ); - // Set the packing block allocator field of the rntm. + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. bli_membrk_rntm_set_membrk( rntm ); -#if 0 - // Allcoate a global communicator for the root thrinfo_t structures. + // Allocate a global communicator for the root thrinfo_t structures. thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); -#endif + // Allocate an array of pthread objects and auxiliary data structs to pass + // to the thread entry functions. + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + + // NOTE: We must iterate backwards so that the chief thread (thread id 0) + // can spawn all other threads before proceeding with its own computation. + for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* restrict rntm_p = rntm; + // Set up thread data for additional threads (beyond thread 0). + datas[tid].func = func; + datas[tid].family = family; + datas[tid].alpha = alpha; + datas[tid].a = a; + datas[tid].b = b; + datas[tid].beta = beta; + datas[tid].c = c; + datas[tid].cntx = cntx; + datas[tid].rntm = rntm; + datas[tid].tid = tid; + datas[tid].gl_comm = gl_comm; + datas[tid].array = array; - cntl_t* cntl_use = NULL; - //thrinfo_t* thread = NULL; - thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED; - - const dim_t tid = 0; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - // NOTE: This is commented out because, in the single-threaded case, - // this is redundant since it's already been done above. - //bli_sba_rntm_set_pool( tid, array, rntm_p ); - - // NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't - // need to alias objects for A, B, and C since they were already aliased - // in bli_*_front(). However, we may add aliasing here in the future so - // that, with all three (_single.c, _openmp.c, _pthreads.c) implementations - // consistently providing local aliases, we can then eliminate aliasing - // elsewhere. - - // Create a default control tree for the operation, if needed. - //bli_l3_cntl_create_if( family, schema_a, schema_b, - // a, b, c, rntm_p, cntl, &cntl_use ); -#if 0 - cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b ); - - // Create the root node of the thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); -#endif - - ( void )tid; - - func - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm_p, - cntl_use, - thread - ); - -#if 0 - // Free the thread's local control tree. - //bli_l3_cntl_free( rntm_p, cntl_use, thread ); - bli_gemm_cntl_free( rntm_p, cntl_use, thread ); - - // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( rntm_p, thread ); -#endif + // Spawn additional threads for ids greater than 1. + if ( tid != 0 ) + bli_pthread_create( &pthreads[tid], NULL, &bli_l3_sup_thread_entry, &datas[tid] ); + else + bli_l3_sup_thread_entry( ( void* )(&datas[0]) ); } // We shouldn't free the global communicator since it was already freed // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). + // (called from the thread entry function). + + // Thread 0 waits for additional threads to finish. + for ( dim_t tid = 1; tid < n_threads; tid++ ) + { + bli_pthread_join( pthreads[tid], NULL ); + } // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. bli_sba_checkin_array( array ); - return BLIS_SUCCESS; + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( pthreads ); -#endif + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( datas ); + + return BLIS_SUCCESS; } #endif diff --git a/test/supmt/Makefile b/test/supmt/Makefile new file mode 100644 index 000000000..2ed93565d --- /dev/null +++ b/test/supmt/Makefile @@ -0,0 +1,580 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2019, Advanced Micro Devices, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# +# Makefile +# +# Field G. Van Zee +# +# Makefile for standalone BLIS test drivers. +# + +# +# --- Makefile PHONY target definitions ---------------------------------------- +# + +.PHONY: all all-st all-mt \ + blis blis-st blis-mt \ + clean cleanx + + + +# +# --- Determine makefile fragment location ------------------------------------- +# + +# Comments: +# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. +# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in +# the second case because CONFIG_NAME is not yet set. +ifneq ($(strip $(BLIS_INSTALL_PATH)),) +LIB_PATH := $(BLIS_INSTALL_PATH)/lib +INC_PATH := $(BLIS_INSTALL_PATH)/include/blis +SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis +else +DIST_PATH := ../.. +LIB_PATH = ../../lib/$(CONFIG_NAME) +INC_PATH = ../../include/$(CONFIG_NAME) +SHARE_PATH := ../.. +endif + + + +# +# --- Include common makefile definitions -------------------------------------- +# + +# Include the common makefile fragment. +-include $(SHARE_PATH)/common.mk + + + +# +# --- BLAS and LAPACK implementations ------------------------------------------ +# + +# BLIS library and header path. This is simply wherever it was installed. +#BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib +#BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis + +# BLIS library. +#BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a + +# BLAS library path(s). This is where the BLAS libraries reside. +HOME_LIB_PATH := $(HOME)/flame/lib +MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 + +# netlib BLAS +NETLIB_LIB := $(HOME_LIB_PATH)/libblas.a + +# OpenBLAS +OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a +OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a + +# BLASFEO +BLASFEO_LIB := $(HOME_LIB_PATH)/libblasfeo.a + +# libxsmm +LIBXSMM_LIB := $(HOME_LIB_PATH)/libxsmm.a -ldl \ + $(NETLIB_LIB) -lgfortran + +# ATLAS +ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \ + $(HOME_LIB_PATH)/libatlas.a + +# Eigen +EIGEN_INC := $(HOME)/flame/eigen/include/eigen3 +EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a +EIGENP_LIB := $(EIGEN_LIB) + +# MKL +MKL_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_intel_lp64 \ + -lmkl_core \ + -lmkl_sequential \ + -lpthread -lm -ldl +MKLP_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_intel_lp64 \ + -lmkl_core \ + -lmkl_gnu_thread \ + -lpthread -lm -ldl -fopenmp + #-L$(ICC_LIB_PATH) \ + #-lgomp + +VENDOR_LIB := $(MKL_LIB) +VENDORP_LIB := $(MKLP_LIB) + + +# +# --- Problem size definitions ------------------------------------------------- +# + +# Single core +PS_BEGIN := 4 +PS_MAX := 800 +PS_INC := 4 + +# Multicore +P1_BEGIN := 8 +P1_MAX := 1600 +P1_INC := 8 + + +# +# --- General build definitions ------------------------------------------------ +# + +TEST_SRC_PATH := . +TEST_OBJ_PATH := . + +# Gather all local object files. +TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \ + $(TEST_OBJ_PATH)/%.o, \ + $(wildcard $(TEST_SRC_PATH)/*.c))) + +# Override the value of CINCFLAGS so that the value of CFLAGS returned by +# get-frame-cflags-for() is not cluttered up with include paths needed only +# while building BLIS. +CINCFLAGS := -I$(INC_PATH) + +# Use the "framework" CFLAGS for the configuration family. +CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME)) + +# Add local header paths to CFLAGS. +CFLAGS += -I$(TEST_SRC_PATH) + +# Locate the libblis library to which we will link. +LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L) + +# Define a set of CFLAGS for use with C++ and Eigen. +CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS)) +CXXFLAGS += -I$(EIGEN_INC) + +# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading. +CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS)) +CXXFLAGS_MT := -march=native $(CXXFLAGS) + +# Single or multithreaded string +STR_ST := -DTHR_STR=\"st\" +STR_MT := -DTHR_STR=\"mt\" + +# Number of trials per problem size. +N_TRIALS := -DN_TRIALS=3 + +# Problem size specification +PDEF_ST := -DP_BEGIN=$(PS_BEGIN) \ + -DP_MAX=$(PS_MAX) \ + -DP_INC=$(PS_INC) + +PDEF_MT := -DP_BEGIN=$(P1_BEGIN) \ + -DP_MAX=$(P1_MAX) \ + -DP_INC=$(P1_INC) + +ifeq ($(E),1) +ERRCHK := -DERROR_CHECK +else +ERRCHK := -DNO_ERROR_CHECK +endif + +# Enumerate possible datatypes and computation precisions. +#dts := s d c z +DTS := d + +TRANS := n_n \ + n_t \ + t_n \ + t_t + +# While BLIS supports all combinations of row and column storage for matrices +# C, A, and B, the alternatives mostly only support CBLAS APIs, which inherently +# support only "all row-storage" or "all column-storage". Thus, we disable the +# building of those other drivers so that compilation/linking completes sooner. +#STORS := r_r_r \ +# r_r_c \ +# r_c_r \ +# r_c_c \ +# c_r_r \ +# c_r_c \ +# c_c_r \ +# c_c_c +STORS := r_r_r \ + c_c_c + + +SHAPES := l_l_s \ + l_s_l \ + s_l_l \ + s_s_l \ + s_l_s \ + l_s_s \ + l_l_l + +SMS := 6 +SNS := 8 +SKS := 10 + + +# +# --- Function definitions ----------------------------------------------------- +# + +# A function to strip the underscores from a list of strings. +stripu = $(subst _,,$(1)) + +# Various functions that help us construct the datatype combinations and then +# extract the needed datatype strings and C preprocessor define flags. +get-1of2 = $(word 1,$(subst _, ,$(1))) +get-2of2 = $(word 2,$(subst _, ,$(1))) + +get-1of3 = $(word 1,$(subst _, ,$(1))) +get-2of3 = $(word 2,$(subst _, ,$(1))) +get-3of3 = $(word 3,$(subst _, ,$(1))) + +# Datatype defs. +get-dt-cpp = $(strip \ + $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\ + $(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\ + $(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\ + -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX)))) + +# Transpose defs. +get-tra-defs-a = $(strip $(subst n,-DTRANSA=BLIS_NO_TRANSPOSE -DA_NOTRANS, \ + $(subst t,-DTRANSA=BLIS_TRANSPOSE -DA_TRANS,$(call get-1of2,$(1))))) +get-tra-defs-b = $(strip $(subst n,-DTRANSB=BLIS_NO_TRANSPOSE -DB_NOTRANS, \ + $(subst t,-DTRANSB=BLIS_TRANSPOSE -DB_TRANS,$(call get-2of2,$(1))))) +get-tra-defs = $(call get-tra-defs-a,$(1)) $(call get-tra-defs-b,$(1)) + +# Storage defs. +get-sto-uch-a = $(strip $(subst r,R, \ + $(subst c,C,$(call get-1of3,$(1))))) +get-sto-uch-b = $(strip $(subst r,R, \ + $(subst c,C,$(call get-2of3,$(1))))) +get-sto-uch-c = $(strip $(subst r,R, \ + $(subst c,C,$(call get-3of3,$(1))))) +get-sto-defs = $(strip \ + -DSTOR3=BLIS_$(call get-sto-uch-a,$(1))$(call get-sto-uch-b,$(1))$(call get-sto-uch-c,$(1)) \ + -DA_STOR_$(call get-sto-uch-a,$(1)) \ + -DB_STOR_$(call get-sto-uch-b,$(1)) \ + -DC_STOR_$(call get-sto-uch-c,$(1))) + +# Dimension defs. +get-shape-defs-cm = $(if $(findstring l,$(1)),-DM_DIM=-1,-DM_DIM=$(2)) +get-shape-defs-cn = $(if $(findstring l,$(1)),-DN_DIM=-1,-DN_DIM=$(2)) +get-shape-defs-ck = $(if $(findstring l,$(1)),-DK_DIM=-1,-DK_DIM=$(2)) +get-shape-defs-m = $(call get-shape-defs-cm,$(call get-1of3,$(1)),$(2)) +get-shape-defs-n = $(call get-shape-defs-cn,$(call get-2of3,$(1)),$(2)) +get-shape-defs-k = $(call get-shape-defs-ck,$(call get-3of3,$(1)),$(2)) + +# arguments: 1: shape (w/ underscores) 2: smallm 3: smalln 4: smallk +get-shape-defs = $(strip $(call get-shape-defs-m,$(1),$(2)) \ + $(call get-shape-defs-n,$(1),$(3)) \ + $(call get-shape-defs-k,$(1),$(4))) + +#$(error l_l_s 6 8 4 = $(call get-shape-defs,l_l_s,6,8,4)) + +# Shape-dimension string. +get-shape-str-ch = $(if $(findstring l,$(1)),p,$(2)) +get-shape-str-m = $(call get-shape-str-ch,$(call get-1of3,$(1)),$(2)) +get-shape-str-n = $(call get-shape-str-ch,$(call get-2of3,$(1)),$(2)) +get-shape-str-k = $(call get-shape-str-ch,$(call get-3of3,$(1)),$(2)) + +# arguments: 1: shape (w/ underscores) 2: smallm 3: smalln 4: smallk +get-shape-dim-str = m$(call get-shape-str-m,$(1),$(2))n$(call get-shape-str-n,$(1),$(3))k$(call get-shape-str-k,$(1),$(4)) + +# Implementation defs. +# Define a function to return the appropriate -DSTR= and -D[BLIS|BLAS] flags. +get-imp-defs = $(strip $(subst blissup,-DSTR=\"$(1)\" -DBLIS -DSUP, \ + $(subst blislpab,-DSTR=\"$(1)\" -DBLIS, \ + $(subst eigen,-DSTR=\"$(1)\" -DEIGEN, \ + $(subst openblas,-DSTR=\"$(1)\" -DCBLAS, \ + $(subst blasfeo,-DSTR=\"$(1)\" -DCBLAS, \ + $(subst libxsmm,-DSTR=\"$(1)\" -DBLAS -DXSMM, \ + $(subst vendor,-DSTR=\"$(1)\" -DCBLAS,$(1))))))))) + +TRANS0 = $(call stripu,$(TRANS)) +STORS0 = $(call stripu,$(STORS)) + +# Limit BLAS and Eigen to only using all row-stored, or all column-stored matrices. +# Also, limit libxsmm to using all column-stored matrices since it does not offer +# CBLAS interfaces. +BSTORS0 = rrr ccc +ESTORS0 = rrr ccc +XSTORS0 = ccc + + +# +# --- Object and binary file definitons ---------------------------------------- +# + +get-st-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_st.o))))))) + +# Build a list of object files and binaries for each single-threaded +# implementation using the get-st-objs() function defined above. +BLISSUP_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup) +BLISSUP_ST_BINS := $(patsubst %.o,%.x,$(BLISSUP_ST_OBJS)) + +BLISLPAB_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab) +BLISLPAB_ST_BINS := $(patsubst %.o,%.x,$(BLISLPAB_ST_OBJS)) + +EIGEN_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen) +EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS)) + +OPENBLAS_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas) +OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS)) + +BLASFEO_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blasfeo) +BLASFEO_ST_BINS := $(patsubst %.o,%.x,$(BLASFEO_ST_OBJS)) + +LIBXSMM_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),libxsmm) +LIBXSMM_ST_BINS := $(patsubst %.o,%.x,$(LIBXSMM_ST_OBJS)) + +VENDOR_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor) +VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) + +# Mark the object files as intermediate so that make will remove them +# automatically after building the binaries on which they depend. +.INTERMEDIATE: $(BLISSUP_ST_OBJS) \ + $(BLISLPAB_ST_OBJS) \ + $(EIGEN_ST_OBJS) \ + $(OPENBLAS_ST_OBJS) \ + $(BLASFEO_ST_OBJS) \ + $(LIBXSMM_ST_OBJS) \ + $(VENDOR_ST_OBJS) + +get-mt-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_mt.o))))))) + +# Build a list of object files and binaries for each multithreaded +# implementation using the get-st-objs() function defined above. +BLISSUP_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup) +BLISSUP_MT_BINS := $(patsubst %.o,%.x,$(BLISSUP_MT_OBJS)) + +BLISLPAB_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab) +BLISLPAB_MT_BINS := $(patsubst %.o,%.x,$(BLISLPAB_MT_OBJS)) + +EIGEN_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen) +EIGEN_MT_BINS := $(patsubst %.o,%.x,$(EIGEN_MT_OBJS)) + +OPENBLAS_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas) +OPENBLAS_MT_BINS := $(patsubst %.o,%.x,$(OPENBLAS_MT_OBJS)) + +VENDOR_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor) +VENDOR_MT_BINS := $(patsubst %.o,%.x,$(VENDOR_MT_OBJS)) + +#$(error "objs = $(EIGEN_ST_BINS)" ) + +# Mark the object files as intermediate so that make will remove them +# automatically after building the binaries on which they depend. +.INTERMEDIATE: $(BLISSUP_MT_OBJS) \ + $(BLISLPAB_MT_OBJS) \ + $(EIGEN_MT_OBJS) \ + $(OPENBLAS_MT_OBJS) \ + $(VENDOR_MT_OBJS) + + +# +# --- Targets/rules ------------------------------------------------------------ +# + +all: st + +blis: blissup-st blislpab-st + +blissup: blissup-st +blislpab: blislpab-st +eigen: eigen-st +openblas: openblas-st +blasfeo: blasfeo-st +libxsmm: libxsmm-st +vendor: vendor-st + +st: blissup-st blislpab-st \ + eigen-st openblas-st blasfeo-st libxsmm-st vendor-st + +blissup-st: $(BLISSUP_ST_BINS) +blislpab-st: $(BLISLPAB_ST_BINS) +eigen-st: $(EIGEN_ST_BINS) +openblas-st: $(OPENBLAS_ST_BINS) +blasfeo-st: $(BLASFEO_ST_BINS) +libxsmm-st: $(LIBXSMM_ST_BINS) +vendor-st: $(VENDOR_ST_BINS) + +mt: blissup-mt blislpab-mt \ + eigen-mt openblas-mt vendor-mt + +blissup-mt: $(BLISSUP_MT_BINS) +blislpab-mt: $(BLISLPAB_MT_BINS) +eigen-mt: $(EIGEN_MT_BINS) +openblas-mt: $(OPENBLAS_MT_BINS) +vendor-mt: $(VENDOR_MT_BINS) + + + +# --Object file rules -- + +# Define the implementations for which we will instantiate compilation rules. +BIMPLS_ST := blissup blislpab openblas blasfeo libxsmm vendor +BIMPLS_MT := blissup blislpab openblas vendor +EIMPLS := eigen + +# 1 2 3 4 567 8 +# test_dgemm_nn_rrr_mpn6kp_blissup_st.x + +# Define the function that will be used to instantiate compilation rules +# for the various single-threaded implementations. +define make-st-rule +test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile + $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ +endef + +# Instantiate the rule function make-st-rule() for each BLIS/BLAS/CBLAS +# implementation. +$(foreach dt,$(DTS), \ +$(foreach tr,$(TRANS), \ +$(foreach st,$(STORS), \ +$(foreach sh,$(SHAPES), \ +$(foreach sm,$(SMS), \ +$(foreach sn,$(SNS), \ +$(foreach sk,$(SKS), \ +$(foreach impl,$(BIMPLS_ST), \ +$(eval $(call make-st-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) + +# Define the function that will be used to instantiate compilation rules +# for the various multithreaded implementations. +define make-mt-rule +test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile + $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@ +endef + +# Instantiate the rule function make-mt-rule() for each BLIS/BLAS/CBLAS +# implementation. +$(foreach dt,$(DTS), \ +$(foreach tr,$(TRANS), \ +$(foreach st,$(STORS), \ +$(foreach sh,$(SHAPES), \ +$(foreach sm,$(SMS), \ +$(foreach sn,$(SNS), \ +$(foreach sk,$(SKS), \ +$(foreach impl,$(BIMPLS_MT), \ +$(eval $(call make-mt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) + +# Define the function that will be used to instantiate compilation rules +# for the single-threaded Eigen implementation. +define make-eigst-rule +test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile + $(CXX) $(CXXFLAGS_ST) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ +endef + +# Instantiate the rule function make-st-rule() for each Eigen implementation. +$(foreach dt,$(DTS), \ +$(foreach tr,$(TRANS), \ +$(foreach st,$(STORS), \ +$(foreach sh,$(SHAPES), \ +$(foreach sm,$(SMS), \ +$(foreach sn,$(SNS), \ +$(foreach sk,$(SKS), \ +$(foreach impl,$(EIMPLS), \ +$(eval $(call make-eigst-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) + +# Define the function that will be used to instantiate compilation rules +# for the multithreaded Eigen implementation. +define make-eigmt-rule +test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile + $(CXX) $(CXXFLAGS_MT) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@ +endef + +# Instantiate the rule function make-st-rule() for each Eigen implementation. +$(foreach dt,$(DTS), \ +$(foreach tr,$(TRANS), \ +$(foreach st,$(STORS), \ +$(foreach sh,$(SHAPES), \ +$(foreach sm,$(SMS), \ +$(foreach sn,$(SNS), \ +$(foreach sk,$(SKS), \ +$(foreach impl,$(EIMPLS), \ +$(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) + + +# -- Executable file rules -- + +# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS +# on the link command line in case BLIS was configured with the BLAS +# compatibility layer. This prevents BLIS from inadvertently getting called +# for the BLAS routines we are trying to test with. + +test_%_blissup_st.x: test_%_blissup_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_blislpab_st.x: test_%_blislpab_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_eigen_st.x: test_%_eigen_st.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_blasfeo_st.x: test_%_blasfeo_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(BLASFEO_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_libxsmm_st.x: test_%_libxsmm_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBXSMM_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_vendor_st.x: test_%_vendor_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +test_%_blissup_mt.x: test_%_blissup_mt.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_blislpab_mt.x: test_%_blislpab_mt.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_eigen_mt.x: test_%_eigen_mt.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_openblas_mt.x: test_%_openblas_mt.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_vendor_mt.x: test_%_vendor_mt.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +# -- Clean rules -- + +clean: cleanx + +cleanx: + - $(RM_F) *.x *.o + diff --git a/test/supmt/octave/gen_opsupnames.m b/test/supmt/octave/gen_opsupnames.m new file mode 100644 index 000000000..2debd4142 --- /dev/null +++ b/test/supmt/octave/gen_opsupnames.m @@ -0,0 +1,52 @@ +function [ r_val1, r_val2 ] = gen_opsupnames( ops, stor, smalldims ) + +nops = size( ops, 1 ); + +smallm = smalldims( 1 ); +smalln = smalldims( 2 ); +smallk = smalldims( 3 ); + +i = 1; + +for io = 1:nops + + op = ops( io, : ); + + str0 = sprintf( '%s_%s_m%dnpkp', op, stor, smallm ); + str1 = sprintf( '%s_%s_mpn%dkp', op, stor, smalln ); + str2 = sprintf( '%s_%s_mpnpk%d', op, stor, smallk ); + str3 = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk ); + str4 = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk ); + str5 = sprintf( '%s_%s_m%dn%dkp', op, stor, smallm, smalln ); + str6 = sprintf( '%s_%s_mpnpkp', op, stor ); + + %opsupnames( i+0, : ) = sprintf( '%s_%s_m%dnpkp ', op, stor, smallm ) + %opsupnames( i+1, : ) = sprintf( '%s_%s_mpn%dkp ', op, stor, smalln ) + %opsupnames( i+2, : ) = sprintf( '%s_%s_mpnpk%d', op, stor, smallk ) + %opsupnames( i+3, : ) = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk ) + %opsupnames( i+4, : ) = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk ) + %opsupnames( i+5, : ) = sprintf( '%s_%s_m%dn%dkp ', op, stor, smallm, smalln ) + %opsupnames( i+6, : ) = sprintf( '%s_%s_mpnpkp ', op, stor ) + + opsupnames( i+0, : ) = sprintf( '%-20s', str0 ); + opsupnames( i+1, : ) = sprintf( '%-20s', str1 ); + opsupnames( i+2, : ) = sprintf( '%-20s', str2 ); + opsupnames( i+3, : ) = sprintf( '%-20s', str3 ); + opsupnames( i+4, : ) = sprintf( '%-20s', str4 ); + opsupnames( i+5, : ) = sprintf( '%-20s', str5 ); + opsupnames( i+6, : ) = sprintf( '%-20s', str6 ); + + opnames( i+0, : ) = sprintf( '%s', op ); + opnames( i+1, : ) = sprintf( '%s', op ); + opnames( i+2, : ) = sprintf( '%s', op ); + opnames( i+3, : ) = sprintf( '%s', op ); + opnames( i+4, : ) = sprintf( '%s', op ); + opnames( i+5, : ) = sprintf( '%s', op ); + opnames( i+6, : ) = sprintf( '%s', op ); + + i = i + 7; +end + +r_val1 = opsupnames; +r_val2 = opnames; + diff --git a/test/supmt/octave/plot_l3sup_perf.m b/test/supmt/octave/plot_l3sup_perf.m new file mode 100644 index 000000000..28056e25a --- /dev/null +++ b/test/supmt/octave/plot_l3sup_perf.m @@ -0,0 +1,250 @@ +function r_val = plot_l3sup_perf( opname, ... + data_blissup, ... + data_blislpab, ... + data_eigen, ... + data_open, ... + data_vend, vend_str, ... + nth, ... + rows, cols, ... + cfreq, ... + dfps, ... + theid, impl ) +%if ... %mod(theid-1,cols) == 2 || ... +% ... %mod(theid-1,cols) == 3 || ... +% ... %mod(theid-1,cols) == 4 || ... +% 0 == 1 ... %theid >= 19 +% show_plot = 0; +%else + show_plot = 1; +%end + +%legend_plot_id = 11; +legend_plot_id = 1*cols + 1*5; + +if 1 +ax1 = subplot( rows, cols, theid ); +hold( ax1, 'on' ); +end + +% Set line properties. +color_blissup = 'k'; lines_blissup = '-'; markr_blissup = ''; +color_blislpab = 'k'; lines_blislpab = ':'; markr_blislpab = ''; +color_eigen = 'm'; lines_eigen = '-.'; markr_eigen = 'o'; +color_open = 'r'; lines_open = '--'; markr_open = 'o'; +color_vend = 'b'; lines_vend = '-.'; markr_vend = '.'; + +% Compute the peak performance in terms of the number of double flops +% executable per cycle and the clock rate. +if opname(1) == 's' || opname(1) == 'c' + flopspercycle = dfps * 2; +else + flopspercycle = dfps; +end +max_perf_core = (flopspercycle * cfreq) * 1; + +% Escape underscores in the title. +title_opname = strrep( opname, '_', '\_' ); + +% Print the title to a string. +titlename = '%s'; +titlename = sprintf( titlename, title_opname ); + +% Set the legend strings. +blissup_legend = sprintf( 'BLIS sup' ); +blislpab_legend = sprintf( 'BLIS conv' ); +eigen_legend = sprintf( 'Eigen' ); +open_legend = sprintf( 'OpenBLAS' ); +%vend_legend = sprintf( 'MKL' ); +%vend_legend = sprintf( 'ARMPL' ); +vend_legend = vend_str; + +% Set axes range values. +y_scale = 1.00; +x_begin = 0; +%x_end is set below. +y_begin = 0; +y_end = max_perf_core * y_scale; + +% Set axes names. +if nth == 1 + yaxisname = 'GFLOPS'; +else + yaxisname = 'GFLOPS/core'; +end + + +%flopscol = 4; +flopscol = size( data_blissup, 2 ); +msize = 5; +if 1 +fontsize = 11; +else +fontsize = 16; +end +linesize = 0.5; +legend_loc = 'southeast'; + +% -------------------------------------------------------------------- + +% Automatically detect a column with the increasing problem size. +% Then set the maximum x-axis value. +for psize_col = 1:3 + if data_blissup( 1, psize_col ) ~= data_blissup( 2, psize_col ) + break; + end +end +x_axis( :, 1 ) = data_blissup( :, psize_col ); + +% Compute the number of data points we have in the x-axis. Note that +% we only use quarter the data points for the m = n = k column of graphs. +if mod(theid-1,cols) == 6 + np = size( data_blissup, 1 ) / 4; +else + np = size( data_blissup, 1 ); +end + +% Grab the last x-axis value. +x_end = data_blissup( np, psize_col ); + +%data_peak( 1, 1:2 ) = [ 0 max_perf_core ]; +%data_peak( 2, 1:2 ) = [ x_end max_perf_core ]; + +if show_plot == 1 +blissup_ln = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ... + 'Color',color_blissup, 'LineStyle',lines_blissup, ... + 'LineWidth',linesize ); +blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ... + 'Color',color_blislpab, 'LineStyle',lines_blislpab, ... + 'LineWidth',linesize ); +eigen_ln = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ... + 'Color',color_eigen, 'LineStyle',lines_eigen, ... + 'LineWidth',linesize ); +open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ... + 'Color',color_open, 'LineStyle',lines_open, ... + 'LineWidth',linesize ); +vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ... + 'Color',color_vend, 'LineStyle',lines_vend, ... + 'LineWidth',linesize ); +else +if theid == legend_plot_id +blissup_ln = line( nan, nan, ... + 'Color',color_blissup, 'LineStyle',lines_blissup, ... + 'LineWidth',linesize ); +blislpab_ln = line( nan, nan, ... + 'Color',color_blislpab, 'LineStyle',lines_blislpab, ... + 'LineWidth',linesize ); +eigen_ln = line( nan, nan, ... + 'Color',color_eigen, 'LineStyle',lines_eigen, ... + 'LineWidth',linesize ); +open_ln = line( nan, nan, ... + 'Color',color_open, 'LineStyle',lines_open, ... + 'LineWidth',linesize ); +vend_ln = line( nan, nan, ... + 'Color',color_vend, 'LineStyle',lines_vend, ... + 'LineWidth',linesize ); +end +end + + +xlim( ax1, [x_begin x_end] ); +ylim( ax1, [y_begin y_end] ); + +if 6000 <= x_end && x_end < 10000 + x_tick2 = x_end - 2000; + x_tick1 = x_tick2/2; + xticks( ax1, [ x_tick1 x_tick2 ] ); +elseif 4000 <= x_end && x_end < 6000 + x_tick2 = x_end - 1000; + x_tick1 = x_tick2/2; + xticks( ax1, [ x_tick1 x_tick2 ] ); +elseif 2000 <= x_end && x_end < 3000 + x_tick2 = x_end - 400; + x_tick1 = x_tick2/2; + xticks( ax1, [ x_tick1 x_tick2 ] ); +elseif 500 <= x_end && x_end < 1000 + x_tick3 = x_end*(3/4); + x_tick2 = x_end*(2/4); + x_tick1 = x_end*(1/4); + xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] ); +end + +if show_plot == 1 || theid == legend_plot_id + if theid == legend_plot_id + leg = legend( ... + [ ... + blissup_ln ... + blislpab_ln ... + eigen_ln ... + open_ln ... + vend_ln ... + ], ... + blissup_legend, ... + blislpab_legend, ... + eigen_legend, ... + open_legend, ... + vend_legend, ... + 'Location', legend_loc ); + set( leg,'Box','off' ); + set( leg,'Color','none' ); + set( leg,'Units','inches' ); + if impl == 'octave' + set( leg,'FontSize',fontsize ); + set( leg,'Position',[12.50 10.35 1.5 0.9 ] ); % (1,4tl) + else + set( leg,'FontSize',fontsize-1 ); + set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl) + end + set( leg,'Box','off' ); + set( leg,'Color','none' ); + set( leg,'Units','inches' ); + % xpos ypos + %set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl) + end +end + +set( ax1,'FontSize',fontsize ); +set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1. +box( ax1, 'on' ); + +titl = title( titlename ); +set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'. + +if impl == 'octave' +tpos = get( titl, 'Position' ); % default is to align across whole figure, not box. +tpos(1) = tpos(1) + -40; +set( titl, 'Position', tpos ); % here we nudge it back to centered with box. +end + +if theid > (rows-1)*cols +%xlab = xlabel( ax1,xaxisname ); +%tpos = get( xlab, 'Position' ) +%tpos(2) = tpos(2) + 10; +%set( xlab, 'Position', tpos ); + if theid == rows*cols - 6 + xlab = xlabel( ax1, 'm = 6; n = k' ); + elseif theid == rows*cols - 5 + xlab = xlabel( ax1, 'n = 8; m = k' ); + elseif theid == rows*cols - 4 + xlab = xlabel( ax1, 'k = 10; m = n' ); + elseif theid == rows*cols - 3 + xlab = xlabel( ax1, 'm; n = 8, k = 10' ); + elseif theid == rows*cols - 2 + xlab = xlabel( ax1, 'n; m = 6, k = 10' ); + elseif theid == rows*cols - 1 + xlab = xlabel( ax1, 'k; m = 6, n = 8' ); + elseif theid == rows*cols - 0 + xlab = xlabel( ax1, 'm = n = k' ); + end +end + +if mod(theid-1,cols) == 0 +ylab = ylabel( ax1,yaxisname ); +end + +%export_fig( filename, colorflag, '-pdf', '-m2', '-painters', '-transparent' ); +%saveas( fig, filename_png ); + +%hold( ax1, 'off' ); + +r_val = 0; + diff --git a/test/supmt/octave/plot_panel_trxsh.m b/test/supmt/octave/plot_panel_trxsh.m new file mode 100644 index 000000000..cf5e860a5 --- /dev/null +++ b/test/supmt/octave/plot_panel_trxsh.m @@ -0,0 +1,152 @@ +function r_val = plot_panel_trxsh ... + ( ... + cfreq, ... + dflopspercycle, ... + nth, ... + thr_str, ... + dt_ch, ... + stor_str, ... + smalldims, ... + dirpath, ... + arch_str, ... + vend_str, ... + impl ... + ) + +%cfreq = 1.8; +%dflopspercycle = 32; + +% Create filename "templates" for the files that contain the performance +% results. +filetemp_blissup = '%s/output_%s_%s_blissup.m'; +filetemp_blislpab = '%s/output_%s_%s_blislpab.m'; +filetemp_eigen = '%s/output_%s_%s_eigen.m'; +filetemp_open = '%s/output_%s_%s_openblas.m'; +filetemp_vend = '%s/output_%s_%s_vendor.m'; + +% Create a variable name "template" for the variables contained in the +% files outlined above. +vartemp = 'data_%s_%s_%s( :, : )'; + +% Define the datatypes and operations we will be plotting. +oproot = sprintf( '%cgemm', dt_ch ); +ops( 1, : ) = sprintf( '%s_nn', oproot ); +ops( 2, : ) = sprintf( '%s_nt', oproot ); +ops( 3, : ) = sprintf( '%s_tn', oproot ); +ops( 4, : ) = sprintf( '%s_tt', oproot ); + +% Generate datatype-specific operation names from the set of operations +% and datatypes. +[ opsupnames, opnames ] = gen_opsupnames( ops, stor_str, smalldims ); +n_opsupnames = size( opsupnames, 1 ); + +%opsupnames +%opnames +%return + +if 1 == 1 + %fig = figure('Position', [100, 100, 2400, 1500]); + fig = figure('Position', [100, 100, 2400, 1200]); + orient( fig, 'portrait' ); + set(gcf,'PaperUnits', 'inches'); + if impl == 'matlab' + set(gcf,'PaperSize', [11.5 20.4]); + set(gcf,'PaperPosition', [0 0 11.5 20.4]); + set(gcf,'PaperPositionMode','manual'); + else % impl == 'octave' % octave 4.x + set(gcf,'PaperSize', [12 21.5]); + set(gcf,'PaperPositionMode','auto'); + end + set(gcf,'PaperOrientation','landscape'); +end + + +% Iterate over the list of datatype-specific operation names. +for opi = 1:n_opsupnames +%for opi = 1:1 + + % Grab the current datatype combination. + opsupname = opsupnames( opi, : ); + opname = opnames( opi, : ); + + opsupname = strtrim( opsupname ); + opname = strtrim( opname ); + + str = sprintf( 'Plotting %2d: %s', opi, opsupname ); disp(str); + + % Construct filenames for the data files from templates. + file_blissup = sprintf( filetemp_blissup, dirpath, thr_str, opsupname ); + file_blislpab = sprintf( filetemp_blislpab, dirpath, thr_str, opsupname ); + file_eigen = sprintf( filetemp_eigen, dirpath, thr_str, opsupname ); + file_open = sprintf( filetemp_open, dirpath, thr_str, opsupname ); + file_vend = sprintf( filetemp_vend, dirpath, thr_str, opsupname ); + + % Load the data files. + %str = sprintf( ' Loading %s', file_blissup ); disp(str); + run( file_blissup ) + run( file_blislpab ) + run( file_eigen ) + run( file_open ) + run( file_vend ) + + % Construct variable names for the variables in the data files. + var_blissup = sprintf( vartemp, thr_str, opname, 'blissup' ); + var_blislpab = sprintf( vartemp, thr_str, opname, 'blislpab' ); + var_eigen = sprintf( vartemp, thr_str, opname, 'eigen' ); + var_open = sprintf( vartemp, thr_str, opname, 'openblas' ); + var_vend = sprintf( vartemp, thr_str, opname, 'vendor' ); + + % Use eval() to instantiate the variable names constructed above, + % copying each to a simplified name. + data_blissup = eval( var_blissup ); % e.g. data_st_dgemm_blissup( :, : ); + data_blislpab = eval( var_blislpab ); % e.g. data_st_dgemm_blislpab( :, : ); + data_eigen = eval( var_eigen ); % e.g. data_st_dgemm_eigen( :, : ); + data_open = eval( var_open ); % e.g. data_st_dgemm_openblas( :, : ); + data_vend = eval( var_vend ); % e.g. data_st_dgemm_vendor( :, : ); + + %str = sprintf( ' Reading %s', var_blissup ); disp(str); + %str = sprintf( ' Reading %s', var_blislpab ); disp(str); + %str = sprintf( ' Reading %s', var_eigen ); disp(str); + %str = sprintf( ' Reading %s', var_open ); disp(str); + %str = sprintf( ' Reading %s', var_bfeo ); disp(str); + %str = sprintf( ' Reading %s', var_xsmm ); disp(str); + %str = sprintf( ' Reading %s', var_vend ); disp(str); + + % Plot one result in an m x n grid of plots, via the subplot() + % function. + if 1 == 1 + plot_l3sup_perf( opsupname, ... + data_blissup, ... + data_blislpab, ... + data_eigen, ... + data_open, ... + data_vend, vend_str, ... + nth, ... + 4, 7, ... + cfreq, ... + dflopspercycle, ... + opi, impl ); + + clear data_mt_*gemm_*; + clear data_blissup; + clear data_blislpab; + clear data_eigen; + clear data_open; + clear data_vend; + + end + +end + +% Construct the name of the file to which we will output the graph. +outfile = sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth ); + +% Output the graph to pdf format. +%print(gcf, 'gemm_md','-fillpage','-dpdf'); +%print(gcf, outfile,'-bestfit','-dpdf'); +if impl == 'octave' +print(gcf, outfile); +else % if impl == 'matlab' +print(gcf, outfile,'-bestfit','-dpdf'); +end + diff --git a/test/supmt/octave/runthese.m b/test/supmt/octave/runthese.m new file mode 100644 index 000000000..5946d4796 --- /dev/null +++ b/test/supmt/octave/runthese.m @@ -0,0 +1,12 @@ + +% haswell +plot_panel_trxsh(3.25,16,1,'mt','d','ccc',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all; +plot_panel_trxsh(3.25,16,1,'mt','d','rrr',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all; + +% kabylake +plot_panel_trxsh(3.80,16,1,'mt','d','rrr',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all; +plot_panel_trxsh(3.80,16,1,'mt','d','ccc',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all; + +% epyc +plot_panel_trxsh(3.00, 8,1,'mt','d','rrr',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all; +plot_panel_trxsh(3.00, 8,1,'mt','d','ccc',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all; diff --git a/test/supmt/runme.sh b/test/supmt/runme.sh new file mode 100755 index 000000000..e878d76b0 --- /dev/null +++ b/test/supmt/runme.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# File pefixes. +exec_root="test" +out_root="output" + +sys="blis" +#sys="lonestar5" +#sys="ul252" +#sys="ul264" + +if [ ${sys} = "blis" ]; then + + export GOMP_CPU_AFFINITY="0-3" + nt=4 + +elif [ ${sys} = "lonestar5" ]; then + + export GOMP_CPU_AFFINITY="0-23" + nt=24 + +elif [ ${sys} = "ul252" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" + export GOMP_CPU_AFFINITY="0-51" + nt=52 + +elif [ ${sys} = "ul264" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" + export GOMP_CPU_AFFINITY="0-63" + nt=64 + +fi + +# Delay between test cases. +delay=0.02 + +# Threadedness to test. +threads="mt" + +# Datatypes to test. +#dts="d s" +dts="d" + +# Operations to test. +ops="gemm" + +# Transpose combintions to test. +trans="nn nt tn tt" + +# Storage combinations to test. +#stors="rrr rrc rcr rcc crr crc ccr ccc" +stors="rrr ccc" + +# Problem shapes to test. +shapes="sll lsl lls lss sls ssl lll" + +# FGVZ: figure out how to probe what's in the directory and +# execute everything that's there? +sms="6" +sns="8" +sks="10" + +# Implementations to test. +impls="vendor blissup blislpab openblas eigen" +#impls="vendor" +#impls="blissup" +#impls="blislpab" +#impls="openblas" +#impls="eigen" + +# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can +# restore the value. +GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY} + +# Example: test_dgemm_nn_rrc_m6npkp_blissup_st.x + +for th in ${threads}; do + + for dt in ${dts}; do + + for op in ${ops}; do + + for tr in ${trans}; do + + for st in ${stors}; do + + for sh in ${shapes}; do + + for sm in ${sms}; do + + for sn in ${sns}; do + + for sk in ${sks}; do + + for im in ${impls}; do + + if [ "${im:0:4}" = "blis" ]; then + unset OMP_NUM_THREADS + export BLIS_NUM_THREADS=${nt} + elif [ "${im}" = "openblas" ]; then + unset OMP_NUM_THREADS + export OPENBLAS_NUM_THREADS=${nt} + elif [ "${im}" = "eigen" ]; then + export OMP_NUM_THREADS=${nt} + elif [ "${im}" = "vendor" ]; then + unset OMP_NUM_THREADS + export MKL_NUM_THREADS=${nt} + fi + + # Multithreaded OpenBLAS seems to have a problem + # running properly if GOMP_CPU_AFFINITY is set. + # So we temporarily unset it here if we are about + # to execute OpenBLAS, but otherwise restore it. + if [ ${im} = "openblas" ]; then + unset GOMP_CPU_AFFINITY + else + export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}" + fi + + # Limit execution of non-BLIS implementations to + # rrr/ccc storage cases. + if [ "${im:0:4}" != "blis" ] && \ + [ "${st}" != "rrr" ] && \ + [ "${st}" != "ccc" ]; then + continue; + fi + + # Further limit execution of libxsmm to + # ccc storage cases. + if [ "${im:0:7}" = "libxsmm" ] && \ + [ "${st}" != "ccc" ]; then + continue; + fi + + # Extract the shape chars for m, n, k. + chm=${sh:0:1} + chn=${sh:1:1} + chk=${sh:2:1} + + # Construct the shape substring (e.g. m6npkp) + shstr="" + + if [ ${chm} = "s" ]; then + shstr="${shstr}m${sm}" + else + shstr="${shstr}mp" + fi + + if [ ${chn} = "s" ]; then + shstr="${shstr}n${sn}" + else + shstr="${shstr}np" + fi + + if [ ${chk} = "s" ]; then + shstr="${shstr}k${sk}" + else + shstr="${shstr}kp" + fi + + # Ex: test_dgemm_nn_rrc_m6npkp_blissup_st.x + + # Construct the name of the test executable. + exec_name="${exec_root}_${dt}${op}_${tr}_${st}_${shstr}_${im}_${th}.x" + + # Construct the name of the output file. + out_file="${out_root}_${th}_${dt}${op}_${tr}_${st}_${shstr}_${im}.m" + + echo "Running (nt = ${nt}) ./${exec_name} > ${out_file}" + + # Run executable. + ./${exec_name} > ${out_file} + + sleep ${delay} + + done + done + done + done + done + done + done + done + done +done + diff --git a/test/supmt/test_gemm.c b/test/supmt/test_gemm.c new file mode 100644 index 000000000..95e9d45b2 --- /dev/null +++ b/test/supmt/test_gemm.c @@ -0,0 +1,589 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#ifdef EIGEN + #define BLIS_DISABLE_BLAS_DEFS + #include "blis.h" + #include + //#include + using namespace Eigen; +#else + #include "blis.h" +#endif + +//#define PRINT + +int main( int argc, char** argv ) +{ + rntm_t rntm_g; + + bli_init(); + + // Copy the global rntm_t object in case we need it later when disabling + // sup. + bli_rntm_init_from_global( &rntm_g ); + +#ifndef ERROR_CHECK + bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); +#endif + + + dim_t n_trials = N_TRIALS; + + num_t dt = DT; + +#if 1 + dim_t p_begin = P_BEGIN; + dim_t p_max = P_MAX; + dim_t p_inc = P_INC; +#else + dim_t p_begin = 4; + dim_t p_max = 40; + dim_t p_inc = 4; +#endif + +#if 1 + dim_t m_input = M_DIM; + dim_t n_input = N_DIM; + dim_t k_input = K_DIM; +#else + p_begin = p_inc = 32; + dim_t m_input = 6; + dim_t n_input = -1; + dim_t k_input = -1; +#endif + +#if 1 + trans_t transa = TRANSA; + trans_t transb = TRANSB; +#else + trans_t transa = BLIS_NO_TRANSPOSE; + trans_t transb = BLIS_NO_TRANSPOSE; +#endif + +#if 1 + stor3_t sc = STOR3; +#else + stor3_t sc = BLIS_RRR; +#endif + + + inc_t rs_c, cs_c; + inc_t rs_a, cs_a; + inc_t rs_b, cs_b; + + if ( sc == BLIS_RRR ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b = -1; } + else if ( sc == BLIS_RRC ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b = 0; } + else if ( sc == BLIS_RCR ) { rs_c = cs_c = -1; rs_a = cs_a = 0; rs_b = cs_b = -1; } + else if ( sc == BLIS_RCC ) { rs_c = cs_c = -1; rs_a = cs_a = 0; rs_b = cs_b = 0; } + else if ( sc == BLIS_CRR ) { rs_c = cs_c = 0; rs_a = cs_a = -1; rs_b = cs_b = -1; } + else if ( sc == BLIS_CRC ) { rs_c = cs_c = 0; rs_a = cs_a = -1; rs_b = cs_b = 0; } + else if ( sc == BLIS_CCR ) { rs_c = cs_c = 0; rs_a = cs_a = 0; rs_b = cs_b = -1; } + else if ( sc == BLIS_CCC ) { rs_c = cs_c = 0; rs_a = cs_a = 0; rs_b = cs_b = 0; } + else { bli_abort(); } + + f77_int cbla_storage; + + if ( sc == BLIS_RRR ) cbla_storage = CblasRowMajor; + else if ( sc == BLIS_CCC ) cbla_storage = CblasColMajor; + else cbla_storage = -1; + + ( void )cbla_storage; + + + char dt_ch; + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + + f77_char f77_transa; + f77_char f77_transb; + char transal, transbl; + + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_trans( transb, &f77_transb ); + + transal = tolower( f77_transa ); + transbl = tolower( f77_transb ); + + f77_int cbla_transa = ( transal == 'n' ? CblasNoTrans : CblasTrans ); + f77_int cbla_transb = ( transbl == 'n' ? CblasNoTrans : CblasTrans ); + + ( void )cbla_transa; + ( void )cbla_transb; + + dim_t p; + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; + + printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch, + transal, transbl, STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) + { + obj_t a, b, c; + obj_t c_save; + obj_t alpha, beta; + dim_t m, n, k; + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + if ( k_input < 0 ) k = p / ( dim_t )abs(k_input); + else k = ( dim_t ) k_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + bli_obj_create( dt, m, n, rs_c, cs_c, &c ); + bli_obj_create( dt, m, n, rs_c, cs_c, &c_save ); + + if ( bli_does_notrans( transa ) ) + bli_obj_create( dt, m, k, rs_a, cs_a, &a ); + else + bli_obj_create( dt, k, m, rs_a, cs_a, &a ); + + if ( bli_does_notrans( transb ) ) + bli_obj_create( dt, k, n, rs_b, cs_b, &b ); + else + bli_obj_create( dt, n, k, rs_b, cs_b, &b ); + + bli_randm( &a ); + bli_randm( &b ); + bli_randm( &c ); + + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_conjtrans( transb, &b ); + + bli_setsc( (1.0/1.0), 0.0, &alpha ); + bli_setsc( (1.0/1.0), 0.0, &beta ); + + bli_copym( &c, &c_save ); + +#ifdef EIGEN + double alpha_r, alpha_i; + + bli_getsc( &alpha, &alpha_r, &alpha_i ); + + void* ap = bli_obj_buffer_at_off( &a ); + void* bp = bli_obj_buffer_at_off( &b ); + void* cp = bli_obj_buffer_at_off( &c ); + + const int os_a = ( bli_obj_is_col_stored( &a ) ? bli_obj_col_stride( &a ) + : bli_obj_row_stride( &a ) ); + const int os_b = ( bli_obj_is_col_stored( &b ) ? bli_obj_col_stride( &b ) + : bli_obj_row_stride( &b ) ); + const int os_c = ( bli_obj_is_col_stored( &c ) ? bli_obj_col_stride( &c ) + : bli_obj_row_stride( &c ) ); + + Stride stride_a( os_a, 1 ); + Stride stride_b( os_b, 1 ); + Stride stride_c( os_c, 1 ); + + #if defined(IS_FLOAT) + #elif defined (IS_DOUBLE) + #ifdef A_STOR_R + typedef Matrix MatrixXd_A; + #else + typedef Matrix MatrixXd_A; + #endif + #ifdef B_STOR_R + typedef Matrix MatrixXd_B; + #else + typedef Matrix MatrixXd_B; + #endif + #ifdef C_STOR_R + typedef Matrix MatrixXd_C; + #else + typedef Matrix MatrixXd_C; + #endif + + #ifdef A_NOTRANS // A is not transposed + Map > A( ( double* )ap, m, k, stride_a ); + #else // A is transposed + Map > A( ( double* )ap, k, m, stride_a ); + #endif + + #ifdef B_NOTRANS // B is not transposed + Map > B( ( double* )bp, k, n, stride_b ); + #else // B is transposed + Map > B( ( double* )bp, n, k, stride_b ); + #endif + + Map > C( ( double* )cp, m, n, stride_c ); + #endif +#endif + + + double dtime_save = DBL_MAX; + + for ( dim_t r = 0; r < n_trials; ++r ) + { + bli_copym( &c_save, &c ); + + + double dtime = bli_clock(); + + +#ifdef EIGEN + + #ifdef A_NOTRANS + #ifdef B_NOTRANS + C.noalias() += alpha_r * A * B; + #else // B_TRANS + C.noalias() += alpha_r * A * B.transpose(); + #endif + #else // A_TRANS + #ifdef B_NOTRANS + C.noalias() += alpha_r * A.transpose() * B; + #else // B_TRANS + C.noalias() += alpha_r * A.transpose() * B.transpose(); + #endif + #endif + +#endif +#ifdef BLIS + #ifdef SUP + // Allow sup. + bli_gemm( &alpha, + &a, + &b, + &beta, + &c ); + #else + // Disable sup and use the expert interface. + //rntm_t rntm = BLIS_RNTM_INITIALIZER; + rntm_t rntm = rntm_g; + bli_rntm_disable_l3_sup( &rntm ); + + bli_gemm_ex( &alpha, + &a, + &b, + &beta, + &c, NULL, &rntm ); + #endif +#endif +#ifdef BLAS + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = ( float* )bli_obj_buffer( &alpha ); + float* ap = ( float* )bli_obj_buffer( &a ); + float* bp = ( float* )bli_obj_buffer( &b ); + float* betap = ( float* )bli_obj_buffer( &beta ); + float* cp = ( float* )bli_obj_buffer( &c ); + + #ifdef XSMM + libxsmm_sgemm( &f77_transa, + #else + sgemm_( &f77_transa, + #endif + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = ( double* )bli_obj_buffer( &alpha ); + double* ap = ( double* )bli_obj_buffer( &a ); + double* bp = ( double* )bli_obj_buffer( &b ); + double* betap = ( double* )bli_obj_buffer( &beta ); + double* cp = ( double* )bli_obj_buffer( &c ); + + #ifdef XSMM + libxsmm_dgemm( &f77_transa, + #else + dgemm_( &f77_transa, + #endif + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha ); + scomplex* ap = ( scomplex* )bli_obj_buffer( &a ); + scomplex* bp = ( scomplex* )bli_obj_buffer( &b ); + scomplex* betap = ( scomplex* )bli_obj_buffer( &beta ); + scomplex* cp = ( scomplex* )bli_obj_buffer( &c ); + + #ifdef XSMM + libxsmm_cgemm( &f77_transa, + #else + cgemm_( &f77_transa, + #endif + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha ); + dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a ); + dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b ); + dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta ); + dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c ); + + #ifdef XSMM + libxsmm_zgemm( &f77_transa, + #else + zgemm_( &f77_transa, + #endif + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } +#endif +#ifdef CBLAS + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + #ifdef C_STOR_R + f77_int lda = bli_obj_row_stride( &a ); + f77_int ldb = bli_obj_row_stride( &b ); + f77_int ldc = bli_obj_row_stride( &c ); + #else + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + #endif + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* bp = bli_obj_buffer( &b ); + float* betap = bli_obj_buffer( &beta ); + float* cp = bli_obj_buffer( &c ); + + cblas_sgemm( cbla_storage, + cbla_transa, + cbla_transb, + mm, + nn, + kk, + *alphap, + ap, lda, + bp, ldb, + *betap, + cp, ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + #ifdef C_STOR_R + f77_int lda = bli_obj_row_stride( &a ); + f77_int ldb = bli_obj_row_stride( &b ); + f77_int ldc = bli_obj_row_stride( &c ); + #else + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + #endif + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* bp = bli_obj_buffer( &b ); + double* betap = bli_obj_buffer( &beta ); + double* cp = bli_obj_buffer( &c ); + + cblas_dgemm( cbla_storage, + cbla_transa, + cbla_transb, + mm, + nn, + kk, + *alphap, + ap, lda, + bp, ldb, + *betap, + cp, ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + #ifdef C_STOR_R + f77_int lda = bli_obj_row_stride( &a ); + f77_int ldb = bli_obj_row_stride( &b ); + f77_int ldc = bli_obj_row_stride( &c ); + #else + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + #endif + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* bp = bli_obj_buffer( &b ); + scomplex* betap = bli_obj_buffer( &beta ); + scomplex* cp = bli_obj_buffer( &c ); + + cblas_cgemm( cbla_storage, + cbla_transa, + cbla_transb, + mm, + nn, + kk, + alphap, + ap, lda, + bp, ldb, + betap, + cp, ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + #ifdef C_STOR_R + f77_int lda = bli_obj_row_stride( &a ); + f77_int ldb = bli_obj_row_stride( &b ); + f77_int ldc = bli_obj_row_stride( &c ); + #else + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + #endif + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* bp = bli_obj_buffer( &b ); + dcomplex* betap = bli_obj_buffer( &beta ); + dcomplex* cp = bli_obj_buffer( &c ); + + cblas_zgemm( cbla_storage, + cbla_transa, + cbla_transb, + mm, + nn, + kk, + alphap, + ap, lda, + bp, ldb, + betap, + cp, ldc ); + } +#endif + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + double gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + + printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch, + transal, transbl, STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )n, + ( unsigned long )k, gflops ); + + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); + + return 0; +} +