diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index c2d6b4076..26030bb2d 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -42,7 +42,6 @@ #include "bli_packm_unb_var1.h" #include "bli_packm_blk_var1.h" -#include "bli_packm_blk_var2.h" #include "bli_packm_struc_cxk.h" #include "bli_packm_struc_cxk_4mi.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 78d52c9ca..3c0318bad 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -62,6 +62,9 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); extern func_t* packm_struc_cxk_kers; +extern func_t* packm_struc_cxk_4mi_kers; +extern func_t* packm_struc_cxk_3mis_kers; +extern func_t* packm_struc_cxk_rih_kers; void bli_packm_blk_var1( obj_t* c, @@ -96,6 +99,8 @@ void bli_packm_blk_var1( obj_t* c, dim_t pd_p = bli_obj_panel_dim( *p ); inc_t ps_p = bli_obj_panel_stride( *p ); + obj_t kappa; + obj_t* kappa_p; void* buf_kappa; func_t* packm_kers; @@ -103,14 +108,61 @@ void bli_packm_blk_var1( obj_t* c, FUNCPTR_T f; - // This variant assumes that the micro-kernel will always apply the - // alpha scalar of the higher-level operation. Thus, we use BLIS_ONE - // for kappa so that the underlying packm implementation does not - // scale during packing. - buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE ); + // Treatment of kappa (ie: packing during scaling) depends on + // whether we are executing an induced method. + if ( bli_is_ind_packed( schema ) ) + { + // The value for kappa we use will depend on whether the scalar + // attached to A has a nonzero imaginary component. If it does, + // then we will apply the scalar during packing to facilitate + // implementing induced complex domain algorithms in terms of + // real domain micro-kernels. 
(In the aforementioned situation, + // applying a real scalar is easy, but applying a complex one is + // harder, so we avoid the need altogether with the code below.) + if( thread_am_ochief( t ) ) + { + if ( bli_obj_scalar_has_nonzero_imag( p ) ) + { + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); + + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); + + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; + } + } + kappa_p = thread_obroadcast( t, kappa_p ); + + // Acquire the buffer to the kappa chosen above. + buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); + } + else // if ( bli_is_nat_packed( schema ) ) + { + // This branch is for native execution, where we assume that + // the micro-kernel will always apply the alpha scalar of the + // higher-level operation. Thus, we use BLIS_ONE for kappa so + // that the underlying packm implementation does not perform + // any scaling during packing. + buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE ); + } - // Choose the correct func_t object. - packm_kers = packm_struc_cxk_kers; + + // Choose the correct func_t object based on the pack_t schema. + if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers; + else if ( bli_is_3mi_packed( schema ) || + bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers; + else if ( bli_is_ro_packed( schema ) || + bli_is_io_packed( schema ) || + bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; + else packm_kers = packm_struc_cxk_kers; // Query the datatype-specific function pointer from the func_t object. 
packm_ker = bli_func_obj_query( dt_cp, packm_kers ); @@ -144,8 +196,8 @@ void bli_packm_blk_var1( obj_t* c, } -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kertype ) \ +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kertype ) \ \ void PASTEMAC(ch,varname)( \ struc_t strucc, \ @@ -204,6 +256,9 @@ void PASTEMAC(ch,varname)( \ conj_t conjc; \ bool_t row_stored; \ bool_t col_stored; \ + inc_t is_p_use; \ + dim_t ss_num; \ + dim_t ss_den; \ \ ctype* restrict c_use; \ ctype* restrict p_use; \ @@ -274,6 +329,17 @@ void PASTEMAC(ch,varname)( \ m_panel_max = &panel_dim_max; \ n_panel_max = &panel_len_max_i; \ } \ +\ + /* Compute the storage stride scaling. Usually this is just 1. However, + in the case of interleaved 3m, we need to scale by 3/2, and in the + cases of real-only, imag-only, or summed-only, we need to scale by + 1/2. In both cases, we are compensating for the fact that pointer + arithmetic occurs in terms of complex elements rather than real + elements. */ \ + if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \ + else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \ + else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \ + else { ss_num = 1; ss_den = 1; } \ \ /* Compute the total number of iterations we'll need. */ \ num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 
1 : 0 ); \ @@ -297,6 +363,15 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ +\ +/* +if ( row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \ + c_cast, rs_c, cs_c, "%4.1f", "" ); \ +if ( col_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ + c_cast, rs_c, cs_c, "%4.1f", "" ); \ +*/ \ \ for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ @@ -353,6 +428,15 @@ void PASTEMAC(ch,varname)( \ \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ +\ + /* We need to re-compute the imaginary stride as a function of + panel_len_max_i since triangular packed matrices have panels + of varying lengths. NOTE: This imaginary stride value is + only referenced by the packm kernels for induced methods. */ \ + is_p_use = ldp * panel_len_max_i; \ +\ + /* We nudge the imaginary stride up by one if it is odd. */ \ + is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ @@ -370,25 +454,27 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p, \ - is_p ); \ + is_p_use ); \ } \ \ /* NOTE: This value is usually LESS than ps_p because triangular matrices usually have several micro-panels that are shorter than a "full" micro-panel. */ \ - p_inc = ldp * panel_len_max_i; \ -\ - /* We nudge the panel increment up by one if it is odd. */ \ - p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \ + p_inc = ( is_p_use * ss_num ) / ss_den; \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ { \ /* This case executes if the panel belongs to a Hermitian or symmetric matrix, which includes stored, unstored, and diagonal-intersecting panels. 
*/ \ +\ + c_use = c_begin; \ + p_use = p_begin; \ \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ +\ + is_p_use = is_p; \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ @@ -404,13 +490,11 @@ void PASTEMAC(ch,varname)( \ *m_panel_max, \ *n_panel_max, \ kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p, \ - is_p ); \ + c_use, rs_c, cs_c, \ + p_use, rs_p, cs_p, \ + is_p_use ); \ } \ \ - /* NOTE: This value is equivalent to ps_p. */ \ - /*p_inc = ldp * panel_len_max_i;*/ \ p_inc = ps_p; \ } \ else \ @@ -418,9 +502,14 @@ void PASTEMAC(ch,varname)( \ /* This case executes if the panel is general, or, if the panel is part of a triangular matrix and is neither unstored (ie: zero) nor diagonal-intersecting. */ \ +\ + c_use = c_begin; \ + p_use = p_begin; \ \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ +\ + is_p_use = is_p; \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ @@ -436,28 +525,81 @@ void PASTEMAC(ch,varname)( \ *m_panel_max, \ *n_panel_max, \ kappa_cast, \ - c_begin, rs_c, cs_c, \ - p_begin, rs_p, cs_p, \ - is_p ); \ + c_use, rs_c, cs_c, \ + p_use, rs_p, cs_p, \ + is_p_use ); \ } \ -/* - if ( row_stored ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \ - p_begin, rs_p, cs_p, "%9.2e", "" ); \ - else if ( col_stored ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \ - p_begin, rs_p, cs_p, "%9.2e", "" ); \ -*/ \ \ /* NOTE: This value is equivalent to ps_p. 
*/ \ - /*p_inc = ldp * panel_len_max_i;*/ \ p_inc = ps_p; \ } \ \ +/* + if ( bli_is_4mi_packed( schema ) ) { \ + printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \ + if ( col_stored ) { \ + if ( 0 ) \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \ + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \ + } \ + if ( row_stored ) { \ + if ( 0 ) \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \ + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \ + } \ + } \ +*/ \ +/* +*/ \ +\ +/* +*/ \ +/* + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ +\ +\ +/* + if ( row_stored ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \ + (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + inc_t is_b = rs_p * *m_panel_max; \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +\ +\ +/* + if ( col_stored ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, 
*n_panel_max, \ + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \ + (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ \ p_begin += p_inc; \ +\ } \ } -INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t ) +INSERT_GENTFUNCR_BASIC( packm_blk_var1, packm_ker_t ) diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var1.c.old similarity index 67% rename from frame/1m/packm/bli_packm_blk_var2.c rename to frame/1m/packm/bli_packm_blk_var1.c.old index c53c8e6bc..78d52c9ca 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var1.c.old @@ -59,15 +59,12 @@ typedef void (*FUNCPTR_T)( packm_thrinfo_t* thread ); -//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2); +static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); extern func_t* packm_struc_cxk_kers; -extern func_t* packm_struc_cxk_4mi_kers; -extern func_t* packm_struc_cxk_3mis_kers; -extern func_t* packm_struc_cxk_rih_kers; -void bli_packm_blk_var2( obj_t* c, +void bli_packm_blk_var1( obj_t* c, obj_t* p, packm_thrinfo_t* t ) { @@ -99,8 +96,6 @@ void bli_packm_blk_var2( obj_t* c, dim_t pd_p = bli_obj_panel_dim( *p ); inc_t ps_p = bli_obj_panel_stride( *p ); - obj_t kappa; - obj_t* kappa_p; void* buf_kappa; func_t* packm_kers; @@ -108,59 +103,14 @@ void bli_packm_blk_var2( obj_t* c, FUNCPTR_T f; + // This variant assumes that the micro-kernel will always apply the + // alpha scalar of the higher-level operation. Thus, we use BLIS_ONE + // for kappa so that the underlying packm implementation does not + // scale during packing. 
+ buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE ); -/* - // We want this variant to behave identically to that of variant 1 - // in the real domain. - if ( bli_is_real( dt_cp ) ) - { - bli_packm_blk_var1( c, p, t ); - return; - } -*/ - - // The value for kappa we use will depend on whether the scalar - // attached to A has a nonzero imaginary component. If it does, - // then we will apply the scalar during packing to facilitate - // implementing induced complex domain algorithms in terms of - // real domain micro-kernels. (In the aforementioned situation, - // applying a real scalar is easy, but applying a complex one is - // harder, so we avoid the need altogether with the code below.) - if( thread_am_ochief( t ) ) - { - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } - } - kappa_p = thread_obroadcast( t, kappa_p ); - - - // Acquire the buffer to the kappa chosen above. - buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); - - - // Choose the correct func_t object based on the pack_t schema. - if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers; - else if ( bli_is_3mi_packed( schema ) || - bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers; - else if ( bli_is_ro_packed( schema ) || - bli_is_io_packed( schema ) || - bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; - else packm_kers = packm_struc_cxk_kers; + // Choose the correct func_t object. + packm_kers = packm_struc_cxk_kers; // Query the datatype-specific function pointer from the func_t object. 
packm_ker = bli_func_obj_query( dt_cp, packm_kers ); @@ -168,9 +118,7 @@ void bli_packm_blk_var2( obj_t* c, // Index into the type combination array to extract the correct // function pointer. - //f = ftypes[dt_cp]; - if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var2; - else f = bli_zpackm_blk_var2; + f = ftypes[dt_cp]; // Invoke the function. f( strucc, @@ -196,8 +144,8 @@ void bli_packm_blk_var2( obj_t* c, } -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kertype ) \ +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, kertype ) \ \ void PASTEMAC(ch,varname)( \ struc_t strucc, \ @@ -256,9 +204,6 @@ void PASTEMAC(ch,varname)( \ conj_t conjc; \ bool_t row_stored; \ bool_t col_stored; \ - inc_t is_p_use; \ - dim_t ss_num; \ - dim_t ss_den; \ \ ctype* restrict c_use; \ ctype* restrict p_use; \ @@ -329,17 +274,6 @@ void PASTEMAC(ch,varname)( \ m_panel_max = &panel_dim_max; \ n_panel_max = &panel_len_max_i; \ } \ -\ - /* Compute the storage stride scaling. Usually this is just 1. However, - in the case of interleaved 3m, we need to scale by 3/2, and in the - cases of real-only, imag-only, or summed-only, we need to scale by - 1/2. In both cases, we are compensating for the fact that pointer - arithmetic occurs in terms of complex elements rather than real - elements. */ \ - if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \ - else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \ - else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \ - else { ss_num = 1; ss_den = 1; } \ \ /* Compute the total number of iterations we'll need. */ \ num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 
1 : 0 ); \ @@ -363,15 +297,6 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ -\ -/* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -if ( col_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ \ for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ @@ -428,14 +353,6 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ -\ - /* We need to re-compute the imaginary stride as a function of - panel_len_max_i since triangular packed matrices have panels - of varying lengths. */ \ - is_p_use = ldp * panel_len_max_i; \ -\ - /* We nudge the imaginary stride up by one if it is odd. */ \ - is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ @@ -453,31 +370,25 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p, \ - is_p_use ); \ + is_p ); \ } \ \ /* NOTE: This value is usually LESS than ps_p because triangular matrices usually have several micro-panels that are shorter than a "full" micro-panel. */ \ -/* p_inc = ldp * panel_len_max_i; \ +\ + /* We nudge the panel increment up by one if it is odd. */ \ p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \ -*/ \ - p_inc = ( is_p_use * ss_num ) / ss_den; \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ { \ /* This case executes if the panel belongs to a Hermitian or symmetric matrix, which includes stored, unstored, and diagonal-intersecting panels. 
*/ \ -\ - c_use = c_begin; \ - p_use = p_begin; \ \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ -\ - is_p_use = is_p; \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ @@ -493,12 +404,13 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ *m_panel_max, \ *n_panel_max, \ kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use ); \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p, \ + is_p ); \ } \ \ /* NOTE: This value is equivalent to ps_p. */ \ + /*p_inc = ldp * panel_len_max_i;*/ \ p_inc = ps_p; \ } \ else \ @@ -506,14 +418,9 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ /* This case executes if the panel is general, or, if the panel is part of a triangular matrix and is neither unstored (ie: zero) nor diagonal-intersecting. */ \ -\ - c_use = c_begin; \ - p_use = p_begin; \ \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ -\ - is_p_use = is_p; \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ @@ -529,81 +436,28 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ *m_panel_max, \ *n_panel_max, \ kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use ); \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p, \ + is_p ); \ } \ +/* + if ( row_stored ) \ + PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \ + p_begin, rs_p, cs_p, "%9.2e", "" ); \ + else if ( col_stored ) \ + PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \ + p_begin, rs_p, cs_p, "%9.2e", "" ); \ +*/ \ \ /* NOTE: This value is equivalent to ps_p. 
*/ \ + /*p_inc = ldp * panel_len_max_i;*/ \ p_inc = ps_p; \ } \ \ -/* - if ( bli_is_4mi_packed( schema ) ) { \ - printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \ - if ( col_stored ) { \ - if ( 0 ) \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \ - } \ - if ( row_stored ) { \ - if ( 0 ) \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \ - } \ - } \ -*/ \ -/* -*/ \ -\ -/* -*/ \ -/* - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ -\ -\ -/* - if ( row_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \ - (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - inc_t is_b = rs_p * *m_panel_max; \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ \ -\ -\ -/* - if ( col_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, 
*n_panel_max, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \ - (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ \ \ p_begin += p_inc; \ -\ } \ } -INSERT_GENTFUNCCO_BASIC( packm_blk_var2, packm_ker_t ) +INSERT_GENTFUNC_BASIC( packm_blk_var1, packm_ker_t ) diff --git a/frame/1m/packm/bli_packm_blk_var2.h b/frame/1m/packm/bli_packm_blk_var2.h deleted file mode 100644 index 84e7b1cb5..000000000 --- a/frame/1m/packm/bli_packm_blk_var2.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_packm_blk_var2( obj_t* c, - obj_t* p, - packm_thrinfo_t* t ); - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool_t invdiag, \ - bool_t revifup, \ - bool_t reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - void* packm_ker, \ - packm_thrinfo_t* t \ - ); - -INSERT_GENTPROTCO_BASIC( packm_blk_var2 ) - diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 2a8287109..c90c0eaa5 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -44,7 +44,7 @@ static FUNCPTR_T vars[6][3] = { // unblocked optimized unblocked blocked { bli_packm_unb_var1, NULL, bli_packm_blk_var1 }, - { NULL, NULL, bli_packm_blk_var2 }, + { NULL, NULL, NULL, }, { NULL, NULL, NULL, }, { NULL, NULL, NULL, }, { NULL, NULL, NULL, }, diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 6be16bc0c..137fa4184 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -654,6 +654,14 @@ bli_is_io_packed( schema ) || \ bli_is_rpi_packed( schema ) ) +#define bli_is_nat_packed( schema ) \ +\ + ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == 0 ) + +#define bli_is_ind_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ) + // pointer-related @@ -668,9 +676,6 @@ } - - - // return datatype for char #define bli_stype ( BLIS_FLOAT ) diff --git a/frame/ind/cntl/bli_gemm3m1_cntl.c b/frame/ind/cntl/bli_gemm3m1_cntl.c index ab9c98e7e..909aedfd5 100644 --- a/frame/ind/cntl/bli_gemm3m1_cntl.c +++ b/frame/ind/cntl/bli_gemm3m1_cntl.c @@ -138,7 +138,7 @@ void bli_gemm3m1_cntl_init() gemm3m1_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3m1_mr, gemm3m1_kr, FALSE, // do NOT invert diagonal @@ -150,7 +150,7 @@ void bli_gemm3m1_cntl_init() gemm3m1_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3m1_kr, gemm3m1_nr, FALSE, // do NOT invert diagonal diff --git a/frame/ind/cntl/bli_gemm3m2_cntl.c b/frame/ind/cntl/bli_gemm3m2_cntl.c index 4b80bf69f..7e362473c 100644 --- a/frame/ind/cntl/bli_gemm3m2_cntl.c +++ b/frame/ind/cntl/bli_gemm3m2_cntl.c @@ -146,7 +146,7 @@ void bli_gemm3m2_cntl_init() gemm3m2_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3m2_mr, gemm3m2_kr, FALSE, // do NOT invert diagonal @@ -158,7 +158,7 @@ void bli_gemm3m2_cntl_init() gemm3m2_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3m2_kr, gemm3m2_nr, FALSE, // do NOT invert diagonal diff --git a/frame/ind/cntl/bli_gemm3m3_cntl.c b/frame/ind/cntl/bli_gemm3m3_cntl.c index f2193ca9f..e444067fd 100644 --- a/frame/ind/cntl/bli_gemm3m3_cntl.c +++ b/frame/ind/cntl/bli_gemm3m3_cntl.c @@ -144,7 +144,7 @@ void bli_gemm3m3_cntl_init() gemm3m3_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3m3_kr, gemm3m3_nr, FALSE, // do NOT invert diagonal diff --git a/frame/ind/cntl/bli_gemm3mh_cntl.c b/frame/ind/cntl/bli_gemm3mh_cntl.c index 8e0b28a1d..8f108fa16 100644 --- a/frame/ind/cntl/bli_gemm3mh_cntl.c +++ 
b/frame/ind/cntl/bli_gemm3mh_cntl.c @@ -143,7 +143,7 @@ void bli_gemm3mh_cntl_init() gemm3mh_packa_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3mh_mr, gemm3mh_kr, FALSE, // do NOT invert diagonal @@ -155,7 +155,7 @@ void bli_gemm3mh_cntl_init() gemm3mh_packb_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3mh_kr, gemm3mh_nr, FALSE, // do NOT invert diagonal @@ -168,7 +168,7 @@ void bli_gemm3mh_cntl_init() gemm3mh_packa_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3mh_mr, gemm3mh_kr, FALSE, // do NOT invert diagonal @@ -180,7 +180,7 @@ void bli_gemm3mh_cntl_init() gemm3mh_packb_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3mh_kr, gemm3mh_nr, FALSE, // do NOT invert diagonal @@ -193,7 +193,7 @@ void bli_gemm3mh_cntl_init() gemm3mh_packa_cntl_rpi = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3mh_mr, gemm3mh_kr, FALSE, // do NOT invert diagonal @@ -205,7 +205,7 @@ void bli_gemm3mh_cntl_init() gemm3mh_packb_cntl_rpi = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3mh_kr, gemm3mh_nr, FALSE, // do NOT invert diagonal diff --git a/frame/ind/cntl/bli_gemm4m1_cntl.c b/frame/ind/cntl/bli_gemm4m1_cntl.c index aa2bfa303..3fb517d52 100644 --- a/frame/ind/cntl/bli_gemm4m1_cntl.c +++ b/frame/ind/cntl/bli_gemm4m1_cntl.c @@ -135,7 +135,7 @@ void bli_gemm4m1_cntl_init() gemm4m1_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4m1_mr, gemm4m1_kr, FALSE, // do NOT invert diagonal @@ -147,7 +147,7 @@ void bli_gemm4m1_cntl_init() gemm4m1_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4m1_kr, gemm4m1_nr, FALSE, // do NOT invert diagonal diff --git a/frame/ind/cntl/bli_gemm4mb_cntl.c b/frame/ind/cntl/bli_gemm4mb_cntl.c index 4dddc69e9..c59ed3a98 100644 --- 
a/frame/ind/cntl/bli_gemm4mb_cntl.c +++ b/frame/ind/cntl/bli_gemm4mb_cntl.c @@ -137,7 +137,7 @@ void bli_gemm4mb_cntl_init() gemm4mb_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4mb_mr, gemm4mb_kr, FALSE, // do NOT invert diagonal @@ -149,7 +149,7 @@ void bli_gemm4mb_cntl_init() gemm4mb_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4mb_kr, gemm4mb_nr, FALSE, // do NOT invert diagonal diff --git a/frame/ind/cntl/bli_gemm4mh_cntl.c b/frame/ind/cntl/bli_gemm4mh_cntl.c index 2ce643fca..2deb4ee09 100644 --- a/frame/ind/cntl/bli_gemm4mh_cntl.c +++ b/frame/ind/cntl/bli_gemm4mh_cntl.c @@ -145,7 +145,7 @@ void bli_gemm4mh_cntl_init() gemm4mh_packa_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4mh_mr, gemm4mh_kr, FALSE, // do NOT invert diagonal @@ -157,7 +157,7 @@ void bli_gemm4mh_cntl_init() gemm4mh_packb_cntl_ro = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4mh_kr, gemm4mh_nr, FALSE, // do NOT invert diagonal @@ -170,7 +170,7 @@ void bli_gemm4mh_cntl_init() gemm4mh_packa_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4mh_mr, gemm4mh_kr, FALSE, // do NOT invert diagonal @@ -182,7 +182,7 @@ void bli_gemm4mh_cntl_init() gemm4mh_packb_cntl_io = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4mh_kr, gemm4mh_nr, FALSE, // do NOT invert diagonal diff --git a/frame/ind/cntl/bli_trsm3m1_cntl.c b/frame/ind/cntl/bli_trsm3m1_cntl.c index 642d9317f..5c88cd688 100644 --- a/frame/ind/cntl/bli_trsm3m1_cntl.c +++ b/frame/ind/cntl/bli_trsm3m1_cntl.c @@ -112,7 +112,7 @@ void bli_trsm3m1_cntl_init() trsm3m1_l_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, // IMPORTANT: n dim multiple must be mr to // support right and bottom-right edge cases gemm3m1_mr, @@ -126,7 +126,7 @@ void bli_trsm3m1_cntl_init() 
trsm3m1_l_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, // IMPORTANT: m dim multiple must be mr since // B_pack is updated (ie: serves as C) in trsm gemm3m1_mr, @@ -141,7 +141,7 @@ void bli_trsm3m1_cntl_init() trsm3m1_r_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3m1_nr, gemm3m1_mr, FALSE, // do NOT invert diagonal @@ -153,7 +153,7 @@ void bli_trsm3m1_cntl_init() trsm3m1_r_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm3m1_mr, gemm3m1_mr, TRUE, // invert diagonal diff --git a/frame/ind/cntl/bli_trsm4m1_cntl.c b/frame/ind/cntl/bli_trsm4m1_cntl.c index 191e46f61..54883c42f 100644 --- a/frame/ind/cntl/bli_trsm4m1_cntl.c +++ b/frame/ind/cntl/bli_trsm4m1_cntl.c @@ -112,7 +112,7 @@ void bli_trsm4m1_cntl_init() trsm4m1_l_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, // IMPORTANT: n dim multiple must be mr to // support right and bottom-right edge cases gemm4m1_mr, @@ -126,7 +126,7 @@ void bli_trsm4m1_cntl_init() trsm4m1_l_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, // IMPORTANT: m dim multiple must be mr since // B_pack is updated (ie: serves as C) in trsm gemm4m1_mr, @@ -141,7 +141,7 @@ void bli_trsm4m1_cntl_init() trsm4m1_r_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4m1_nr, gemm4m1_mr, FALSE, // do NOT invert diagonal @@ -153,7 +153,7 @@ void bli_trsm4m1_cntl_init() trsm4m1_r_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, + BLIS_VARIANT1, gemm4m1_mr, gemm4m1_mr, TRUE, // invert diagonal