From a51e32ec061941cd10119ea80115c82a40b1673f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 30 Jul 2014 10:41:48 -0500 Subject: [PATCH] Fixed unit register blocksize brokenness. Details: - Fixed a breakdown in BLIS's ability to differentiate between row-stored and column-stored micro-panels when MR or NR is unit. When either register blocksize (or both) is equal to one, inspecting the strides of the affected packed micro-panel is no longer sufficient to determine whether the micro-panel is a row-stored column panel or a column-stored row panel (because both strides are unit). At that point, dimension information is necessary when invoking the bli_is_row_stored_f() and bli_is_col_stored_f() macros (and their "obj" counterparts). Thanks to Ilya Polkovnichenko for reporting this bug. - Added panel dimensions (m and n) to obj_t, which are set in packm_init() and then passed into the blocked variants to support the aforementioned update. --- frame/1m/packm/bli_packm_blk_var1.c | 77 ++++++++++++++------- frame/1m/packm/bli_packm_blk_var1.h | 2 + frame/1m/packm/bli_packm_blk_var3.c | 91 ++++++++++++++----------- frame/1m/packm/bli_packm_blk_var3.h | 2 + frame/1m/packm/bli_packm_blk_var4.c | 91 ++++++++++++++----------- frame/1m/packm/bli_packm_blk_var4.h | 2 + frame/1m/packm/bli_packm_gen_cxk.c | 12 ++-- frame/1m/packm/bli_packm_herm_cxk.c | 48 ++++++------- frame/1m/packm/bli_packm_init.c | 11 ++- frame/1m/packm/bli_packm_tri_cxk.c | 12 ++-- frame/1m/unpackm/bli_unpackm_blk_var2.c | 32 +++++---- frame/1m/unpackm/bli_unpackm_blk_var2.h | 2 + frame/include/bli_obj_macro_defs.h | 64 +++++++++-------- frame/include/bli_param_macro_defs.h | 17 +++-- frame/include/bli_type_defs.h | 6 ++ 15 files changed, 273 insertions(+), 196 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index af50d994f..fab263ebf 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -49,6 +49,8 @@ 
typedef void (*FUNCPTR_T)( dim_t n, dim_t m_max, dim_t n_max, + dim_t m_panel, + dim_t n_panel, void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, @@ -78,6 +80,8 @@ void bli_packm_blk_var1( obj_t* c, dim_t n_p = bli_obj_width( *p ); dim_t m_max_p = bli_obj_padded_length( *p ); dim_t n_max_p = bli_obj_padded_width( *p ); + dim_t m_panel = bli_obj_panel_length( *p ); + dim_t n_panel = bli_obj_panel_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); @@ -116,11 +120,13 @@ void bli_packm_blk_var1( obj_t* c, n_p, m_max_p, n_max_p, + m_panel, + n_panel, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, pd_p, ps_p, - t ); + t ); } @@ -140,6 +146,8 @@ void PASTEMAC(ch,varname )( \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ + dim_t m_panel, \ + dim_t n_panel, \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ @@ -204,7 +212,7 @@ void PASTEMAC(ch,varname )( \ /* If the strides of P indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panels. */ \ iter_dim = n; \ @@ -213,7 +221,7 @@ void PASTEMAC(ch,varname )( \ panel_dim_max = pd_p; \ ldc = rs_c; \ vs_c = cs_c; \ - diagoffc_inc = -( doff_t)panel_dim_max; \ + diagoffc_inc = -( doff_t )panel_dim_max; \ ldp = rs_p; \ m_panel_full = &m; \ n_panel_full = &panel_dim_i; \ @@ -222,7 +230,7 @@ void PASTEMAC(ch,varname )( \ m_panel_max = &panel_len_max_i; \ n_panel_max = &panel_dim_max; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panels. 
*/ \ iter_dim = m; \ @@ -264,7 +272,7 @@ void PASTEMAC(ch,varname )( \ \ p_begin = p_cast; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ @@ -295,20 +303,20 @@ void PASTEMAC(ch,varname )( \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. */ \ - if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \ - ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \ + if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i < 0 ) || \ + ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i > 0 ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ - if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ { \ panel_off_i = 0; \ panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \ diagoffp_i = diagoffc_i; \ } \ - else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ { \ panel_off_i = bli_abs( diagoffc_i ); \ panel_len_i = panel_len_full - panel_off_i; \ @@ -319,8 +327,8 @@ void PASTEMAC(ch,varname )( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ - if( packm_thread_my_iter( it, thread ) ) \ - { \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ 
PASTEMAC(ch,packm_tri_cxk)( strucc, \ diagoffp_i, \ diagc, \ @@ -334,8 +342,7 @@ void PASTEMAC(ch,varname )( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ - }\ -\ + }\ \ p_inc = ldp * panel_len_max_i; \ } \ @@ -348,8 +355,8 @@ void PASTEMAC(ch,varname )( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - if( packm_thread_my_iter( it, thread ) ) \ - { \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk)( strucc, \ diagoffc_i, \ uploc, \ @@ -361,7 +368,7 @@ void PASTEMAC(ch,varname )( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ - } \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ @@ -375,8 +382,19 @@ void PASTEMAC(ch,varname )( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - if( packm_thread_my_iter( it, thread ) ) \ - { \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ +/* +printf( "packm_var1: gen case\n" ); \ +printf( "packm_var1: m_panel_use = %d\n", *m_panel_use ); \ +printf( "packm_var1: n_panel_use = %d\n", *n_panel_use ); \ +printf( "packm_var1: m_panel_max = %d\n", *m_panel_max ); \ +printf( "packm_var1: n_panel_max = %d\n", *n_panel_max ); \ +printf( "packm_var1: m_panel = %d\n", m_panel ); \ +printf( "packm_var1: n_panel = %d\n", n_panel ); \ +printf( "packm_var1: rs_c cs_c = %d %d\n", rs_c, cs_c ); \ +printf( "packm_var1: rs_p cs_p = %d %d\n", rs_p, cs_p ); \ +*/ \ PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -388,23 +406,32 @@ void PASTEMAC(ch,varname )( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ - } \ +/* + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ + PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \ + p_begin, rs_p, cs_p, "%9.2e", "" ); \ + else if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ + PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \ + 
p_begin, rs_p, cs_p, "%9.2e", "" ); \ +*/ \ + } \ +\ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ - } \ + } \ \ \ - p_begin += p_inc; \ + p_begin += p_inc; \ } \ \ \ /* if ( rs_p == 1 ) \ PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \ - p_begin, rs_p, cs_p, "%4.1f", "" ); \ + p_begin, rs_p, cs_p, "%9.2e", "" ); \ if ( cs_p == 1 ) \ PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \ - p_begin, rs_p, cs_p, "%4.1f", "" ); \ + p_begin, rs_p, cs_p, "%9.2e", "" ); \ */ \ \ } diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index b366edcef..d13120a59 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -53,6 +53,8 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ + dim_t m_panel, \ + dim_t n_panel, \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index f078ed1b0..23ab95391 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -49,6 +49,8 @@ typedef void (*FUNCPTR_T)( dim_t n, dim_t m_max, dim_t n_max, + dim_t m_panel, + dim_t n_panel, void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, @@ -78,6 +80,8 @@ void bli_packm_blk_var3( obj_t* c, dim_t n_p = bli_obj_width( *p ); dim_t m_max_p = bli_obj_padded_length( *p ); dim_t n_max_p = bli_obj_padded_width( *p ); + dim_t m_panel = bli_obj_panel_length( *p ); + dim_t n_panel = bli_obj_panel_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); @@ -111,26 +115,27 @@ void bli_packm_blk_var3( obj_t* c, // real domain counterparts. 
(In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) - if ( thread_am_ochief( t ) ) { - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); + if ( thread_am_ochief( t ) ) + { + if ( bli_obj_scalar_has_nonzero_imag( p ) ) + { + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } - } - kappa_p = thread_obroadcast( t, kappa_p ); + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; + } + } + kappa_p = thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. @@ -156,11 +161,13 @@ void bli_packm_blk_var3( obj_t* c, n_p, m_max_p, n_max_p, + m_panel, + n_panel, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, pd_p, ps_p, - t ); + t ); } @@ -180,6 +187,8 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ + dim_t m_panel, \ + dim_t n_panel, \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ @@ -244,7 +253,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of P indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. 
*/ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panels. */ \ iter_dim = n; \ @@ -253,7 +262,7 @@ void PASTEMAC(ch,varname)( \ panel_dim_max = pd_p; \ ldc = rs_c; \ vs_c = cs_c; \ - diagoffc_inc = -( doff_t)panel_dim_max; \ + diagoffc_inc = -( doff_t )panel_dim_max; \ ldp = rs_p; \ m_panel_full = &m; \ n_panel_full = &panel_dim_i; \ @@ -262,7 +271,7 @@ void PASTEMAC(ch,varname)( \ m_panel_max = &panel_len_max_i; \ n_panel_max = &panel_dim_max; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panels. */ \ iter_dim = m; \ @@ -304,8 +313,8 @@ void PASTEMAC(ch,varname)( \ \ p_begin = p_cast; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ @@ -335,20 +344,20 @@ void PASTEMAC(ch,varname)( \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \ - ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \ + if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i < 0 ) || \ + ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i > 0 ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ - if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ { \ panel_off_i = 0; \ panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \ diagoffp_i = diagoffc_i; \ } \ - else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ { \ panel_off_i = bli_abs( diagoffc_i ); \ panel_len_i = panel_len_full - panel_off_i; \ @@ -359,8 +368,8 @@ void PASTEMAC(ch,varname)( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ - if( packm_thread_my_iter( it, thread ) ) \ - { \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \ diagoffp_i, \ diagc, \ @@ -374,7 +383,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ - } \ + } \ \ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ @@ -398,8 +407,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - if( packm_thread_my_iter( it, thread ) ) \ - { \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \ diagoffc_i, 
\ uploc, \ @@ -411,8 +420,8 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ - } \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ } \ @@ -425,8 +434,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - if( packm_thread_my_iter( it, thread ) ) \ - { \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -438,11 +447,12 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ - } \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ \ + } \ /* if ( cs_p == 1 ) { \ PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \ @@ -452,9 +462,8 @@ void PASTEMAC(ch,varname)( \ } \ */ \ \ - } \ \ - p_begin += p_inc; \ + p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var3.h b/frame/1m/packm/bli_packm_blk_var3.h index 5f3b3fde2..0a4fd1f79 100644 --- a/frame/1m/packm/bli_packm_blk_var3.h +++ b/frame/1m/packm/bli_packm_blk_var3.h @@ -53,6 +53,8 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ + dim_t m_panel, \ + dim_t n_panel, \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c index 2aef6001b..5ea3cef38 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -49,6 +49,8 @@ typedef void (*FUNCPTR_T)( dim_t n, dim_t m_max, dim_t n_max, + dim_t m_panel, + dim_t n_panel, void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, @@ -78,6 +80,8 @@ void bli_packm_blk_var4( obj_t* c, dim_t n_p = bli_obj_width( *p ); dim_t m_max_p = bli_obj_padded_length( *p ); dim_t n_max_p = bli_obj_padded_width( *p ); + dim_t m_panel = bli_obj_panel_length( 
*p ); + dim_t n_panel = bli_obj_panel_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); @@ -111,26 +115,27 @@ void bli_packm_blk_var4( obj_t* c, // real domain counterparts. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) - if( thread_am_ochief( t ) ) { - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); + if( thread_am_ochief( t ) ) + { + if ( bli_obj_scalar_has_nonzero_imag( p ) ) + { + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } - } - kappa_p = thread_obroadcast( t, kappa_p ); + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; + } + } + kappa_p = thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. 
@@ -156,11 +161,13 @@ void bli_packm_blk_var4( obj_t* c, n_p, m_max_p, n_max_p, + m_panel, + n_panel, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, pd_p, ps_p, - t ); + t ); } @@ -180,6 +187,8 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ + dim_t m_panel, \ + dim_t n_panel, \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ @@ -244,7 +253,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of P indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panels. */ \ iter_dim = n; \ @@ -253,7 +262,7 @@ void PASTEMAC(ch,varname)( \ panel_dim_max = pd_p; \ ldc = rs_c; \ vs_c = cs_c; \ - diagoffc_inc = -( doff_t)panel_dim_max; \ + diagoffc_inc = -( doff_t )panel_dim_max; \ ldp = rs_p; \ m_panel_full = &m; \ n_panel_full = &panel_dim_i; \ @@ -262,7 +271,7 @@ void PASTEMAC(ch,varname)( \ m_panel_max = &panel_len_max_i; \ n_panel_max = &panel_dim_max; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panels. */ \ iter_dim = m; \ @@ -304,8 +313,8 @@ void PASTEMAC(ch,varname)( \ \ p_begin = p_cast; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ @@ -335,20 +344,20 @@ void PASTEMAC(ch,varname)( \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \ - ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \ + if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i < 0 ) || \ + ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i > 0 ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ - if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ { \ panel_off_i = 0; \ panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \ diagoffp_i = diagoffc_i; \ } \ - else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ { \ panel_off_i = bli_abs( diagoffc_i ); \ panel_len_i = panel_len_full - panel_off_i; \ @@ -359,8 +368,8 @@ void PASTEMAC(ch,varname)( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ - if( packm_thread_my_iter( it, thread ) ) \ - { \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \ diagoffp_i, \ diagc, \ @@ -374,7 +383,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ - } \ + } \ \ p_inc = ldp * panel_len_max_i; \ \ @@ -405,8 +414,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - if( packm_thread_my_iter( it, thread ) ) \ - { \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \ diagoffc_i, \ uploc, \ @@ 
-418,7 +427,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ - } \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ @@ -432,8 +441,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - if( packm_thread_my_iter( it, thread ) ) \ - { \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -445,11 +454,12 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ - } \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ \ + } \ /* if ( cs_p == 1 ) { \ PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \ @@ -467,9 +477,8 @@ void PASTEMAC(ch,varname)( \ } \ */ \ \ - } \ \ - p_begin += p_inc; \ + p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var4.h b/frame/1m/packm/bli_packm_blk_var4.h index 30722c7ae..503cd7fde 100644 --- a/frame/1m/packm/bli_packm_blk_var4.h +++ b/frame/1m/packm/bli_packm_blk_var4.h @@ -53,6 +53,8 @@ void PASTEMAC(ch,varname)( \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ + dim_t m_panel, \ + dim_t n_panel, \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ diff --git a/frame/1m/packm/bli_packm_gen_cxk.c b/frame/1m/packm/bli_packm_gen_cxk.c index 299785792..198946e1a 100644 --- a/frame/1m/packm/bli_packm_gen_cxk.c +++ b/frame/1m/packm/bli_packm_gen_cxk.c @@ -62,7 +62,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. 
*/ \ panel_dim = n_panel; \ @@ -71,7 +71,7 @@ void PASTEMAC(ch,varname)( \ ldc = rs_c; \ ldp = rs_p; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. */ \ panel_dim = m_panel; \ @@ -165,7 +165,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. */ \ panel_dim = n_panel; \ @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname)( \ ldc = rs_c; \ ldp = rs_p; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. */ \ panel_dim = m_panel; \ @@ -290,7 +290,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. */ \ panel_dim = n_panel; \ @@ -300,7 +300,7 @@ void PASTEMAC(ch,varname)( \ ldc = rs_c; \ ldp = rs_p; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. 
*/ \ panel_dim = m_panel; \ diff --git a/frame/1m/packm/bli_packm_herm_cxk.c b/frame/1m/packm/bli_packm_herm_cxk.c index 07d06c3b1..e1bec2814 100644 --- a/frame/1m/packm/bli_packm_herm_cxk.c +++ b/frame/1m/packm/bli_packm_herm_cxk.c @@ -84,7 +84,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. */ \ panel_dim = n_panel; \ @@ -95,7 +95,7 @@ void PASTEMAC(ch,varname)( \ rs_p11 = rs_p; \ cs_p11 = 1; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. */ \ panel_dim = m_panel; \ @@ -139,14 +139,14 @@ void PASTEMAC(ch,varname)( \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \ - ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \ + if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \ + ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -171,8 +171,8 @@ void PASTEMAC(ch,varname)( \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( conjc12 ); \ } \ - else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ @@ -347,7 +347,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. */ \ panel_dim = n_panel; \ @@ -359,7 +359,7 @@ void PASTEMAC(ch,varname)( \ rs_p11 = rs_p; \ cs_p11 = 1; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. 
*/ \ panel_dim = m_panel; \ @@ -409,14 +409,14 @@ void PASTEMAC(ch,varname)( \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. */ \ - if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \ - ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \ + if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \ + ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -441,8 +441,8 @@ void PASTEMAC(ch,varname)( \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( conjc12 ); \ } \ - else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ @@ -683,7 +683,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. 
*/ \ panel_dim = n_panel; \ @@ -695,7 +695,7 @@ void PASTEMAC(ch,varname)( \ rs_p11 = rs_p; \ cs_p11 = 1; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. */ \ panel_dim = m_panel; \ @@ -745,14 +745,14 @@ void PASTEMAC(ch,varname)( \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. */ \ - if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \ - ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \ + if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \ + ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -777,8 +777,8 @@ void PASTEMAC(ch,varname)( \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( conjc12 ); \ } \ - else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ - ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 66e383210..713087d46 100644 --- a/frame/1m/packm/bli_packm_init.c +++ 
b/frame/1m/packm/bli_packm_init.c @@ -186,6 +186,7 @@ void bli_packm_init_pack( bool_t densify, dim_t nr_pack_dim = nr_def_dim + nr_ext_dim; mem_t* mem_p; + dim_t m_p, n_p; dim_t m_p_pad, n_p_pad; siz_t size_p; siz_t elem_size_p; @@ -242,8 +243,10 @@ void bli_packm_init_pack( bool_t densify, // in p) and aligning them to the dimension multiples (typically equal // to register blocksizes). This does waste a little bit of space for // level-2 operations, but that's okay with us. - m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mr_def_dim ); - n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), nr_def_dim ); + m_p = bli_obj_length( *p ); + n_p = bli_obj_width( *p ); + m_p_pad = bli_align_dim_to_mult( m_p, mr_def_dim ); + n_p_pad = bli_align_dim_to_mult( n_p, nr_def_dim ); // Save the padded dimensions into the packed object. It is important // to save these dimensions since they represent the actual dimensions @@ -340,6 +343,8 @@ void bli_packm_init_pack( bool_t densify, bli_obj_set_incs( rs_p, cs_p, *p ); bli_obj_set_panel_dim( m_panel, *p ); bli_obj_set_panel_stride( ps_p, *p ); + bli_obj_set_panel_length( m_panel, *p ); + bli_obj_set_panel_width( n_p, *p ); // Compute the size of the packed buffer. size_p = ps_p * (m_p_pad / m_panel) * elem_size_p; @@ -381,6 +386,8 @@ void bli_packm_init_pack( bool_t densify, bli_obj_set_incs( rs_p, cs_p, *p ); bli_obj_set_panel_dim( n_panel, *p ); bli_obj_set_panel_stride( ps_p, *p ); + bli_obj_set_panel_length( m_p, *p ); + bli_obj_set_panel_width( n_panel, *p ); // Compute the size of the packed buffer. 
size_p = ps_p * (n_p_pad / n_panel) * elem_size_p; diff --git a/frame/1m/packm/bli_packm_tri_cxk.c b/frame/1m/packm/bli_packm_tri_cxk.c index 254ab5fc1..12d577436 100644 --- a/frame/1m/packm/bli_packm_tri_cxk.c +++ b/frame/1m/packm/bli_packm_tri_cxk.c @@ -64,7 +64,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. */ \ panel_dim = n_panel; \ @@ -73,7 +73,7 @@ void PASTEMAC(ch,varname)( \ ldc = rs_c; \ ldp = rs_p; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. */ \ panel_dim = m_panel; \ @@ -253,7 +253,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. */ \ panel_dim = n_panel; \ @@ -265,7 +265,7 @@ void PASTEMAC(ch,varname)( \ rs_p11 = rs_p; \ cs_p11 = 1; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. */ \ panel_dim = m_panel; \ @@ -489,7 +489,7 @@ void PASTEMAC(ch,varname)( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides indicate column storage, we are packing to row panels. 
*/ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. */ \ panel_dim = n_panel; \ @@ -501,7 +501,7 @@ void PASTEMAC(ch,varname)( \ rs_p11 = rs_p; \ cs_p11 = 1; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. */ \ panel_dim = m_panel; \ diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.c b/frame/1m/unpackm/bli_unpackm_blk_var2.c index 4bce3e93f..b203322f5 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var2.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var2.c @@ -44,6 +44,8 @@ typedef void (*FUNCPTR_T)( trans_t transc, dim_t m, dim_t n, + dim_t m_panel, + dim_t n_panel, void* p, inc_t rs_p, inc_t cs_p, inc_t pd_p, inc_t ps_p, void* c, inc_t rs_c, inc_t cs_c @@ -79,6 +81,8 @@ void bli_unpackm_blk_var2( obj_t* p, dim_t m_c = bli_obj_length( *c ); dim_t n_c = bli_obj_width( *c ); + dim_t m_panel = bli_obj_panel_length( *c ); + dim_t n_panel = bli_obj_panel_width( *c ); void* buf_p = bli_obj_buffer_at_off( *p ); inc_t rs_p = bli_obj_row_stride( *p ); @@ -104,6 +108,8 @@ void bli_unpackm_blk_var2( obj_t* p, transc, m_c, n_c, + m_panel, + n_panel, buf_p, rs_p, cs_p, pd_p, ps_p, buf_c, rs_c, cs_c ); @@ -121,6 +127,8 @@ void PASTEMAC(ch,varname )( \ trans_t transc, \ dim_t m, \ dim_t n, \ + dim_t m_panel, \ + dim_t n_panel, \ void* p, inc_t rs_p, inc_t cs_p, \ inc_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c \ @@ -145,8 +153,8 @@ void PASTEMAC(ch,varname )( \ inc_t vs_c; \ inc_t incc, ldc; \ inc_t ldp; \ - dim_t* m_panel; \ - dim_t* n_panel; \ + dim_t* m_panel_full; \ + dim_t* n_panel_full; \ \ \ /* If c needs a transposition, induce it so that we can more simply @@ -162,7 +170,7 @@ void PASTEMAC(ch,varname )( \ /* If the strides of p indicate row storage, then we are packing to column panels; otherwise, if the strides 
indicate column storage, we are packing to row panels. */ \ - if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to unpack from column panels. */ \ iter_dim = n; \ @@ -173,10 +181,10 @@ void PASTEMAC(ch,varname )( \ vs_c = cs_c; \ diagoffc_inc = -( doff_t)panel_dim_max; \ ldp = rs_p; \ - m_panel = &m; \ - n_panel = &panel_dim_i; \ + m_panel_full = &m; \ + n_panel_full = &panel_dim_i; \ } \ - else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to unpack from row panels. */ \ iter_dim = m; \ @@ -187,8 +195,8 @@ void PASTEMAC(ch,varname )( \ vs_c = rs_c; \ diagoffc_inc = ( doff_t )panel_dim_max; \ ldp = cs_p; \ - m_panel = &panel_dim_i; \ - n_panel = &n; \ + m_panel_full = &panel_dim_i; \ + n_panel_full = &n; \ } \ \ /* Compute the total number of iterations we'll need. */ \ @@ -215,15 +223,15 @@ void PASTEMAC(ch,varname )( \ lower stored, then we must call scal2m. Otherwise, we can use a variant that is oblivious to structure and storage (and thus tends to be faster). 
*/ \ - if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \ + if ( bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) && \ bli_is_upper_or_lower( uploc ) ) \ { \ PASTEMAC3(ch,ch,ch,scal2m)( diagoffc_i, \ diagc, \ uploc, \ transc, \ - *m_panel, \ - *n_panel, \ + *m_panel_full, \ + *n_panel_full, \ one, \ p_begin, rs_p, cs_p, \ c_begin, rs_c, cs_c ); \ @@ -239,7 +247,7 @@ void PASTEMAC(ch,varname )( \ c_begin, incc, ldc ); \ } \ \ - /*PASTEMAC(ch,fprintm)( stdout, "p copied", *m_panel, *n_panel, \ + /*PASTEMAC(ch,fprintm)( stdout, "p copied", *m_panel_full, *n_panel_full, \ p_begin, rs_p, cs_p, "%4.1f", "" );*/ \ } \ \ diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.h b/frame/1m/unpackm/bli_unpackm_blk_var2.h index e9af5e204..3f38ad793 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var2.h +++ b/frame/1m/unpackm/bli_unpackm_blk_var2.h @@ -48,6 +48,8 @@ void PASTEMAC(ch,varname)( \ trans_t transc, \ dim_t m, \ dim_t n, \ + dim_t m_panel, \ + dim_t n_panel, \ void* p, inc_t rs_p, inc_t cs_p, \ inc_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c \ diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 6cdbce3ad..ff6d768d6 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -557,11 +557,17 @@ bli_obj_width_stored( obj ) #define bli_obj_is_row_stored( obj ) \ \ - ( bli_obj_col_stride_mag( obj ) == 1 ) + ( bli_obj_col_stride_mag( obj ) == 1 && \ + ( bli_obj_row_stride_mag( obj ) > 1 || \ + bli_obj_width( obj ) == 1 ) \ + ) #define bli_obj_is_col_stored( obj ) \ \ - ( bli_obj_row_stride_mag( obj ) == 1 ) + ( bli_obj_row_stride_mag( obj ) == 1 && \ + ( bli_obj_col_stride_mag( obj ) > 1 || \ + bli_obj_length( obj ) == 1 ) \ + ) #define bli_obj_is_gen_stored( obj ) \ \ @@ -735,7 +741,7 @@ bli_obj_width_stored( obj ) } -// Packed dimensions query +// Packed matrix info query #define bli_obj_padded_length( obj ) \ \ @@ -745,7 +751,7 @@ bli_obj_width_stored( obj ) \ ( 
(obj).n_padded ) -// Packed dimensions modification +// Packed matrix info modification #define bli_obj_set_padded_length( m0, obj ) \ { \ @@ -764,48 +770,46 @@ bli_obj_width_stored( obj ) } -// Packed panel dimension query +// Packed panel info query + +#define bli_obj_panel_length( obj ) \ +\ + ((obj).m_panel) + +#define bli_obj_panel_width( obj ) \ +\ + ((obj).n_panel) #define bli_obj_panel_dim( obj ) \ \ ((obj).pd) -// Packed panel dimension modification +#define bli_obj_panel_stride( obj ) \ +\ + ((obj).ps) + +// Packed panel info modification + +#define bli_obj_set_panel_length( m0, obj ) \ +{ \ + (obj).m_panel = m0; \ +} + +#define bli_obj_set_panel_width( n0, obj ) \ +{ \ + (obj).n_panel = n0; \ +} #define bli_obj_set_panel_dim( panel_dim, obj ) \ { \ (obj).pd = panel_dim; \ } - -// Packed panel stride query - -#define bli_obj_panel_stride( obj ) \ -\ - ((obj).ps) - -// Packed panel stride modification - #define bli_obj_set_panel_stride( panel_stride, obj ) \ { \ (obj).ps = panel_stride; \ } - -/* -// Cast mem entry query - -#define bli_obj_cast_mem( obj ) \ -\ - ( &((obj).cast_mem) ) - -// Cast mem entry modification - -#define bli_obj_set_cast_mem( mem_p, obj ) \ -{ \ - (obj).cast_mem = *mem_p; \ -} -*/ // -- Miscellaneous object macros -- diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index cc63c8e0e..6c8cd99be 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -356,6 +356,7 @@ ( bli_does_notrans( trans ) ? ( m == 1 ? (cs) : (rs) ) \ : ( m == 1 ? 
(rs) : (cs) ) ) +/* #define bli_is_row_stored( rs, cs ) \ \ ( bli_abs( cs ) == 1 ) @@ -363,14 +364,15 @@ #define bli_is_col_stored( rs, cs ) \ \ ( bli_abs( rs ) == 1 ) +*/ -#define bli_is_row_stored_f( rs, cs ) \ +#define bli_is_row_stored_f( m, n, rs, cs ) \ \ - ( cs == 1 ) + ( cs == 1 && ( rs > 1 || n == 1 ) ) -#define bli_is_col_stored_f( rs, cs ) \ +#define bli_is_col_stored_f( m, n, rs, cs ) \ \ - ( rs == 1 ) + ( rs == 1 && ( cs > 1 || m == 1 ) ) #define bli_is_gen_stored( rs, cs ) \ \ @@ -391,14 +393,11 @@ #define bli_has_nonunit_inc2( inc1, inc2 ) \ \ - ( inc1 != 1 || \ - inc2 != 1 ) + ( inc1 != 1 || inc2 != 1 ) #define bli_has_nonunit_inc3( inc1, inc2, inc3 ) \ \ - ( inc1 != 1 || \ - inc2 != 1 || \ - inc3 != 1 ) + ( inc1 != 1 || inc2 != 1 || inc3 != 1 ) // diag offset-related diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 208882a02..c826d07b4 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -522,6 +522,8 @@ typedef struct obj_s inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) + dim_t m_panel; // m dimension of a "full" panel + dim_t n_panel; // n dimension of a "full" panel } obj_t; @@ -565,6 +567,8 @@ typedef struct obj_s (b).n_padded = (a).n_padded; \ (b).ps = (a).ps; \ (b).pd = (a).pd; \ + (b).m_panel = (a).m_panel; \ + (b).n_panel = (a).n_panel; \ } #define bli_obj_init_subpart_from( a, b ) \ @@ -596,6 +600,8 @@ typedef struct obj_s (b).n_padded = (a).n_padded; \ (b).pd = (a).pd; \ (b).ps = (a).ps; \ + (b).m_panel = (a).m_panel; \ + (b).n_panel = (a).n_panel; \ }