Fixed unit register blocksize brokenness.

Details:
- Fixed a breakdown in BLIS's ability to differentiate between row-stored
  and column-stored micro-panels when MR or NR is unit. When either
  register blocksize (or both) is equal to one, inspecting the strides of
  the affected packed micro-panel is no longer sufficient to determine
  whether the micro-panel is a row-stored column panel or a column-stored
  row panel (because both strides are unit). At that point, dimension
  information is necessary when invoking the bli_is_row_stored_f() and
  bli_is_col_stored_f() macros (and their "obj" counterparts). Thanks to
  Ilya Polkovnichenko for reporting this bug.
- Added panel dimensions (m and n) to obj_t; these are set in
  packm_init() and then passed into the blocked variants to support the
  aforementioned update.
This commit is contained in:
Field G. Van Zee
2014-07-30 10:41:48 -05:00
parent c2732272f0
commit a51e32ec06
15 changed files with 273 additions and 196 deletions

View File

@@ -49,6 +49,8 @@ typedef void (*FUNCPTR_T)(
dim_t n,
dim_t m_max,
dim_t n_max,
dim_t m_panel,
dim_t n_panel,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
@@ -78,6 +80,8 @@ void bli_packm_blk_var1( obj_t* c,
dim_t n_p = bli_obj_width( *p );
dim_t m_max_p = bli_obj_padded_length( *p );
dim_t n_max_p = bli_obj_padded_width( *p );
dim_t m_panel = bli_obj_panel_length( *p );
dim_t n_panel = bli_obj_panel_width( *p );
void* buf_c = bli_obj_buffer_at_off( *c );
inc_t rs_c = bli_obj_row_stride( *c );
@@ -116,11 +120,13 @@ void bli_packm_blk_var1( obj_t* c,
n_p,
m_max_p,
n_max_p,
m_panel,
n_panel,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
pd_p, ps_p,
t );
t );
}
@@ -140,6 +146,8 @@ void PASTEMAC(ch,varname )( \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
dim_t m_panel, \
dim_t n_panel, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
@@ -204,7 +212,7 @@ void PASTEMAC(ch,varname )( \
/* If the strides of P indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
@@ -213,7 +221,7 @@ void PASTEMAC(ch,varname )( \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t)panel_dim_max; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
@@ -222,7 +230,7 @@ void PASTEMAC(ch,varname )( \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
@@ -264,7 +272,7 @@ void PASTEMAC(ch,varname )( \
\
p_begin = p_cast; \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
@@ -295,20 +303,20 @@ void PASTEMAC(ch,varname )( \
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes
was somehow violated. */ \
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i < 0 ) || \
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
@@ -319,8 +327,8 @@ void PASTEMAC(ch,varname )( \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk)( strucc, \
diagoffp_i, \
diagc, \
@@ -334,8 +342,7 @@ void PASTEMAC(ch,varname )( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
}\
\
}\
\
p_inc = ldp * panel_len_max_i; \
} \
@@ -348,8 +355,8 @@ void PASTEMAC(ch,varname )( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk)( strucc, \
diagoffc_i, \
uploc, \
@@ -361,7 +368,7 @@ void PASTEMAC(ch,varname )( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
@@ -375,8 +382,19 @@ void PASTEMAC(ch,varname )( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
if( packm_thread_my_iter( it, thread ) ) \
{ \
/*
printf( "packm_var1: gen case\n" ); \
printf( "packm_var1: m_panel_use = %d\n", *m_panel_use ); \
printf( "packm_var1: n_panel_use = %d\n", *n_panel_use ); \
printf( "packm_var1: m_panel_max = %d\n", *m_panel_max ); \
printf( "packm_var1: n_panel_max = %d\n", *n_panel_max ); \
printf( "packm_var1: m_panel = %d\n", m_panel ); \
printf( "packm_var1: n_panel = %d\n", n_panel ); \
printf( "packm_var1: rs_c cs_c = %d %d\n", rs_c, cs_c ); \
printf( "packm_var1: rs_p cs_p = %d %d\n", rs_p, cs_p ); \
*/ \
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
@@ -388,23 +406,32 @@ void PASTEMAC(ch,varname )( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
/*
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
p_begin, rs_p, cs_p, "%9.2e", "" ); \
else if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
p_begin, rs_p, cs_p, "%9.2e", "" ); \
*/ \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
} \
} \
\
\
p_begin += p_inc; \
p_begin += p_inc; \
} \
\
\
/*
if ( rs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
p_begin, rs_p, cs_p, "%4.1f", "" ); \
p_begin, rs_p, cs_p, "%9.2e", "" ); \
if ( cs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
p_begin, rs_p, cs_p, "%4.1f", "" ); \
p_begin, rs_p, cs_p, "%9.2e", "" ); \
*/ \
\
}

View File

@@ -53,6 +53,8 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
dim_t m_panel, \
dim_t n_panel, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \

View File

@@ -49,6 +49,8 @@ typedef void (*FUNCPTR_T)(
dim_t n,
dim_t m_max,
dim_t n_max,
dim_t m_panel,
dim_t n_panel,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
@@ -78,6 +80,8 @@ void bli_packm_blk_var3( obj_t* c,
dim_t n_p = bli_obj_width( *p );
dim_t m_max_p = bli_obj_padded_length( *p );
dim_t n_max_p = bli_obj_padded_width( *p );
dim_t m_panel = bli_obj_panel_length( *p );
dim_t n_panel = bli_obj_panel_width( *p );
void* buf_c = bli_obj_buffer_at_off( *c );
inc_t rs_c = bli_obj_row_stride( *c );
@@ -111,26 +115,27 @@ void bli_packm_blk_var3( obj_t* c,
// real domain counterparts. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( thread_am_ochief( t ) ) {
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
if ( thread_am_ochief( t ) )
{
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
}
kappa_p = thread_obroadcast( t, kappa_p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
}
kappa_p = thread_obroadcast( t, kappa_p );
// Acquire the buffer to the kappa chosen above.
@@ -156,11 +161,13 @@ void bli_packm_blk_var3( obj_t* c,
n_p,
m_max_p,
n_max_p,
m_panel,
n_panel,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
pd_p, ps_p,
t );
t );
}
@@ -180,6 +187,8 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
dim_t m_panel, \
dim_t n_panel, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
@@ -244,7 +253,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of P indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
@@ -253,7 +262,7 @@ void PASTEMAC(ch,varname)( \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t)panel_dim_max; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
@@ -262,7 +271,7 @@ void PASTEMAC(ch,varname)( \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
@@ -304,8 +313,8 @@ void PASTEMAC(ch,varname)( \
\
p_begin = p_cast; \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
@@ -335,20 +344,20 @@ void PASTEMAC(ch,varname)( \
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes
was somehow violated. */ \
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i < 0 ) || \
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
@@ -359,8 +368,8 @@ void PASTEMAC(ch,varname)( \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \
diagoffp_i, \
diagc, \
@@ -374,7 +383,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
} \
} \
\
\
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
@@ -398,8 +407,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \
diagoffc_i, \
uploc, \
@@ -411,8 +420,8 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
} \
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
} \
@@ -425,8 +434,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
@@ -438,11 +447,12 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
\
} \
/*
if ( cs_p == 1 ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \
@@ -452,9 +462,8 @@ void PASTEMAC(ch,varname)( \
} \
*/ \
\
} \
\
p_begin += p_inc; \
p_begin += p_inc; \
} \
}

View File

@@ -53,6 +53,8 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
dim_t m_panel, \
dim_t n_panel, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \

View File

@@ -49,6 +49,8 @@ typedef void (*FUNCPTR_T)(
dim_t n,
dim_t m_max,
dim_t n_max,
dim_t m_panel,
dim_t n_panel,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
@@ -78,6 +80,8 @@ void bli_packm_blk_var4( obj_t* c,
dim_t n_p = bli_obj_width( *p );
dim_t m_max_p = bli_obj_padded_length( *p );
dim_t n_max_p = bli_obj_padded_width( *p );
dim_t m_panel = bli_obj_panel_length( *p );
dim_t n_panel = bli_obj_panel_width( *p );
void* buf_c = bli_obj_buffer_at_off( *c );
inc_t rs_c = bli_obj_row_stride( *c );
@@ -111,26 +115,27 @@ void bli_packm_blk_var4( obj_t* c,
// real domain counterparts. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if( thread_am_ochief( t ) ) {
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
if( thread_am_ochief( t ) )
{
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
}
kappa_p = thread_obroadcast( t, kappa_p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
}
kappa_p = thread_obroadcast( t, kappa_p );
// Acquire the buffer to the kappa chosen above.
@@ -156,11 +161,13 @@ void bli_packm_blk_var4( obj_t* c,
n_p,
m_max_p,
n_max_p,
m_panel,
n_panel,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
pd_p, ps_p,
t );
t );
}
@@ -180,6 +187,8 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
dim_t m_panel, \
dim_t n_panel, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
@@ -244,7 +253,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of P indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
@@ -253,7 +262,7 @@ void PASTEMAC(ch,varname)( \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t)panel_dim_max; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
@@ -262,7 +271,7 @@ void PASTEMAC(ch,varname)( \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
@@ -304,8 +313,8 @@ void PASTEMAC(ch,varname)( \
\
p_begin = p_cast; \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
@@ -335,20 +344,20 @@ void PASTEMAC(ch,varname)( \
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes
was somehow violated. */ \
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i < 0 ) || \
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
@@ -359,8 +368,8 @@ void PASTEMAC(ch,varname)( \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \
diagoffp_i, \
diagc, \
@@ -374,7 +383,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
} \
} \
\
p_inc = ldp * panel_len_max_i; \
\
@@ -405,8 +414,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \
diagoffc_i, \
uploc, \
@@ -418,7 +427,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
@@ -432,8 +441,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
@@ -445,11 +454,12 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
\
} \
/*
if ( cs_p == 1 ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \
@@ -467,9 +477,8 @@ void PASTEMAC(ch,varname)( \
} \
*/ \
\
} \
\
p_begin += p_inc; \
p_begin += p_inc; \
} \
}

View File

@@ -53,6 +53,8 @@ void PASTEMAC(ch,varname)( \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
dim_t m_panel, \
dim_t n_panel, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \

View File

@@ -62,7 +62,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
@@ -71,7 +71,7 @@ void PASTEMAC(ch,varname)( \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
@@ -165,7 +165,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
@@ -175,7 +175,7 @@ void PASTEMAC(ch,varname)( \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
@@ -290,7 +290,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
@@ -300,7 +300,7 @@ void PASTEMAC(ch,varname)( \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \

View File

@@ -84,7 +84,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
@@ -95,7 +95,7 @@ void PASTEMAC(ch,varname)( \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
@@ -139,14 +139,14 @@ void PASTEMAC(ch,varname)( \
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes
was somehow violated. */ \
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
@@ -171,8 +171,8 @@ void PASTEMAC(ch,varname)( \
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
@@ -347,7 +347,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
@@ -359,7 +359,7 @@ void PASTEMAC(ch,varname)( \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
@@ -409,14 +409,14 @@ void PASTEMAC(ch,varname)( \
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes
was somehow violated. */ \
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
@@ -441,8 +441,8 @@ void PASTEMAC(ch,varname)( \
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
@@ -683,7 +683,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
@@ -695,7 +695,7 @@ void PASTEMAC(ch,varname)( \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
@@ -745,14 +745,14 @@ void PASTEMAC(ch,varname)( \
a micro-panel. If they do, then the constraint that cache
blocksizes be whole multiples of the register blocksizes
was somehow violated. */ \
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
@@ -777,8 +777,8 @@ void PASTEMAC(ch,varname)( \
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( conjc12 ); \
} \
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \

View File

@@ -186,6 +186,7 @@ void bli_packm_init_pack( bool_t densify,
dim_t nr_pack_dim = nr_def_dim + nr_ext_dim;
mem_t* mem_p;
dim_t m_p, n_p;
dim_t m_p_pad, n_p_pad;
siz_t size_p;
siz_t elem_size_p;
@@ -242,8 +243,10 @@ void bli_packm_init_pack( bool_t densify,
// in p) and aligning them to the dimension multiples (typically equal
// to register blocksizes). This does waste a little bit of space for
// level-2 operations, but that's okay with us.
m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mr_def_dim );
n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), nr_def_dim );
m_p = bli_obj_length( *p );
n_p = bli_obj_width( *p );
m_p_pad = bli_align_dim_to_mult( m_p, mr_def_dim );
n_p_pad = bli_align_dim_to_mult( n_p, nr_def_dim );
// Save the padded dimensions into the packed object. It is important
// to save these dimensions since they represent the actual dimensions
@@ -340,6 +343,8 @@ void bli_packm_init_pack( bool_t densify,
bli_obj_set_incs( rs_p, cs_p, *p );
bli_obj_set_panel_dim( m_panel, *p );
bli_obj_set_panel_stride( ps_p, *p );
bli_obj_set_panel_length( m_panel, *p );
bli_obj_set_panel_width( n_p, *p );
// Compute the size of the packed buffer.
size_p = ps_p * (m_p_pad / m_panel) * elem_size_p;
@@ -381,6 +386,8 @@ void bli_packm_init_pack( bool_t densify,
bli_obj_set_incs( rs_p, cs_p, *p );
bli_obj_set_panel_dim( n_panel, *p );
bli_obj_set_panel_stride( ps_p, *p );
bli_obj_set_panel_length( m_p, *p );
bli_obj_set_panel_width( n_panel, *p );
// Compute the size of the packed buffer.
size_p = ps_p * (n_p_pad / n_panel) * elem_size_p;

View File

@@ -64,7 +64,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
@@ -73,7 +73,7 @@ void PASTEMAC(ch,varname)( \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
@@ -253,7 +253,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
@@ -265,7 +265,7 @@ void PASTEMAC(ch,varname)( \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
@@ -489,7 +489,7 @@ void PASTEMAC(ch,varname)( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
@@ -501,7 +501,7 @@ void PASTEMAC(ch,varname)( \
rs_p11 = rs_p; \
cs_p11 = 1; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \

View File

@@ -44,6 +44,8 @@ typedef void (*FUNCPTR_T)(
trans_t transc,
dim_t m,
dim_t n,
dim_t m_panel,
dim_t n_panel,
void* p, inc_t rs_p, inc_t cs_p,
inc_t pd_p, inc_t ps_p,
void* c, inc_t rs_c, inc_t cs_c
@@ -79,6 +81,8 @@ void bli_unpackm_blk_var2( obj_t* p,
dim_t m_c = bli_obj_length( *c );
dim_t n_c = bli_obj_width( *c );
dim_t m_panel = bli_obj_panel_length( *c );
dim_t n_panel = bli_obj_panel_width( *c );
void* buf_p = bli_obj_buffer_at_off( *p );
inc_t rs_p = bli_obj_row_stride( *p );
@@ -104,6 +108,8 @@ void bli_unpackm_blk_var2( obj_t* p,
transc,
m_c,
n_c,
m_panel,
n_panel,
buf_p, rs_p, cs_p,
pd_p, ps_p,
buf_c, rs_c, cs_c );
@@ -121,6 +127,8 @@ void PASTEMAC(ch,varname )( \
trans_t transc, \
dim_t m, \
dim_t n, \
dim_t m_panel, \
dim_t n_panel, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t pd_p, inc_t ps_p, \
void* c, inc_t rs_c, inc_t cs_c \
@@ -145,8 +153,8 @@ void PASTEMAC(ch,varname )( \
inc_t vs_c; \
inc_t incc, ldc; \
inc_t ldp; \
dim_t* m_panel; \
dim_t* n_panel; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
\
\
/* If c needs a transposition, induce it so that we can more simply
@@ -162,7 +170,7 @@ void PASTEMAC(ch,varname )( \
/* If the strides of p indicate row storage, then we are packing to
column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
{ \
/* Prepare to unpack from column panels. */ \
iter_dim = n; \
@@ -173,10 +181,10 @@ void PASTEMAC(ch,varname )( \
vs_c = cs_c; \
diagoffc_inc = -( doff_t)panel_dim_max; \
ldp = rs_p; \
m_panel = &m; \
n_panel = &panel_dim_i; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
} \
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
{ \
/* Prepare to unpack from row panels. */ \
iter_dim = m; \
@@ -187,8 +195,8 @@ void PASTEMAC(ch,varname )( \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel = &panel_dim_i; \
n_panel = &n; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
} \
\
/* Compute the total number of iterations we'll need. */ \
@@ -215,15 +223,15 @@ void PASTEMAC(ch,varname )( \
lower stored, then we must call scal2m. Otherwise, we can use a
variant that is oblivious to structure and storage (and thus tends
to be faster). */ \
if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
if ( bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) && \
bli_is_upper_or_lower( uploc ) ) \
{ \
PASTEMAC3(ch,ch,ch,scal2m)( diagoffc_i, \
diagc, \
uploc, \
transc, \
*m_panel, \
*n_panel, \
*m_panel_full, \
*n_panel_full, \
one, \
p_begin, rs_p, cs_p, \
c_begin, rs_c, cs_c ); \
@@ -239,7 +247,7 @@ void PASTEMAC(ch,varname )( \
c_begin, incc, ldc ); \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "p copied", *m_panel, *n_panel, \
/*PASTEMAC(ch,fprintm)( stdout, "p copied", *m_panel_full, *n_panel_full, \
p_begin, rs_p, cs_p, "%4.1f", "" );*/ \
} \
\

View File

@@ -48,6 +48,8 @@ void PASTEMAC(ch,varname)( \
trans_t transc, \
dim_t m, \
dim_t n, \
dim_t m_panel, \
dim_t n_panel, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t pd_p, inc_t ps_p, \
void* c, inc_t rs_c, inc_t cs_c \

View File

@@ -557,11 +557,17 @@ bli_obj_width_stored( obj )
// An object is considered row-stored when its column stride has unit
// magnitude and, in addition, either its row stride magnitude exceeds one
// or its width is exactly one. The second condition breaks the tie that
// occurs when both strides are unit, where the strides alone cannot tell
// the two storage schemes apart.
#define bli_obj_is_row_stored( obj ) \
\
	( bli_obj_col_stride_mag( obj ) == 1 && \
	  ( bli_obj_row_stride_mag( obj ) > 1 || \
	    bli_obj_width( obj ) == 1 ) \
	)
// An object is considered column-stored when its row stride has unit
// magnitude and, in addition, either its column stride magnitude exceeds
// one or its length is exactly one. The second condition breaks the tie
// that occurs when both strides are unit, where the strides alone cannot
// tell the two storage schemes apart.
#define bli_obj_is_col_stored( obj ) \
\
	( bli_obj_row_stride_mag( obj ) == 1 && \
	  ( bli_obj_col_stride_mag( obj ) > 1 || \
	    bli_obj_length( obj ) == 1 ) \
	)
#define bli_obj_is_gen_stored( obj ) \
\
@@ -735,7 +741,7 @@ bli_obj_width_stored( obj )
}
// Packed dimensions query
// Packed matrix info query
#define bli_obj_padded_length( obj ) \
\
@@ -745,7 +751,7 @@ bli_obj_width_stored( obj )
\
( (obj).n_padded )
// Packed dimensions modification
// Packed matrix info modification
#define bli_obj_set_padded_length( m0, obj ) \
{ \
@@ -764,48 +770,46 @@ bli_obj_width_stored( obj )
}
// Packed panel dimension query
// Packed panel info query
// Yields the m_panel field of obj: the m dimension of a "full" panel.
#define bli_obj_panel_length( obj ) \
	( ( obj ).m_panel )
// Yields the n_panel field of obj: the n dimension of a "full" panel.
#define bli_obj_panel_width( obj ) \
	( ( obj ).n_panel )
// Yields the pd field of obj: the panel dimension.
#define bli_obj_panel_dim( obj ) \
	( ( obj ).pd )
// Packed panel dimension modification
// Yields the ps field of obj: the panel stride.
#define bli_obj_panel_stride( obj ) \
\
((obj).ps)
// Packed panel info modification
// Stores m0 into the m_panel field of obj (m dimension of a "full" panel).
// Expands to a braced statement block.
#define bli_obj_set_panel_length( m0, obj ) \
{ \
	( obj ).m_panel = ( m0 ); \
}
// Stores n0 into the n_panel field of obj (n dimension of a "full" panel).
// Expands to a braced statement block.
#define bli_obj_set_panel_width( n0, obj ) \
{ \
	( obj ).n_panel = ( n0 ); \
}
// Stores panel_dim into the pd field of obj. Expands to a braced
// statement block.
#define bli_obj_set_panel_dim( panel_dim, obj ) \
{ \
	( obj ).pd = ( panel_dim ); \
}
// Packed panel stride query
// Yields the ps field of obj: the panel stride.
#define bli_obj_panel_stride( obj ) \
\
((obj).ps)
// Packed panel stride modification
// Stores panel_stride into the ps field of obj. Expands to a braced
// statement block.
#define bli_obj_set_panel_stride( panel_stride, obj ) \
{ \
	( obj ).ps = ( panel_stride ); \
}
/*
// Cast mem entry query
#define bli_obj_cast_mem( obj ) \
\
( &((obj).cast_mem) )
// Cast mem entry modification
#define bli_obj_set_cast_mem( mem_p, obj ) \
{ \
(obj).cast_mem = *mem_p; \
}
*/
// -- Miscellaneous object macros --

View File

@@ -356,6 +356,7 @@
( bli_does_notrans( trans ) ? ( m == 1 ? (cs) : (rs) ) \
: ( m == 1 ? (rs) : (cs) ) )
/*
#define bli_is_row_stored( rs, cs ) \
\
( bli_abs( cs ) == 1 )
@@ -363,14 +364,15 @@
#define bli_is_col_stored( rs, cs ) \
\
( bli_abs( rs ) == 1 )
*/
// Row-storage test on raw strides. A unit column stride alone is not
// conclusive: when the row stride is also unit, the result is true only
// if the n dimension is one. The m argument is accepted but not inspected.
// Each parameter is parenthesized so that compound expressions (e.g.
// "a & b") may be passed without being re-associated with the comparisons.
#define bli_is_row_stored_f( m, n, rs, cs ) \
\
	( (cs) == 1 && ( (rs) > 1 || (n) == 1 ) )
// Column-storage test on raw strides. A unit row stride alone is not
// conclusive: when the column stride is also unit, the result is true only
// if the m dimension is one. The n argument is accepted but not inspected.
// Each parameter is parenthesized so that compound expressions (e.g.
// "a & b") may be passed without being re-associated with the comparisons.
#define bli_is_col_stored_f( m, n, rs, cs ) \
\
	( (rs) == 1 && ( (cs) > 1 || (m) == 1 ) )
#define bli_is_gen_stored( rs, cs ) \
\
@@ -391,14 +393,11 @@
// True when at least one of the two increments differs from one.
// Parameters are parenthesized so compound argument expressions are
// evaluated as a unit before the comparison.
#define bli_has_nonunit_inc2( inc1, inc2 ) \
\
	( (inc1) != 1 || (inc2) != 1 )
// True when at least one of the three increments differs from one.
// Parameters are parenthesized so compound argument expressions are
// evaluated as a unit before the comparison.
#define bli_has_nonunit_inc3( inc1, inc2, inc3 ) \
\
	( (inc1) != 1 || (inc2) != 1 || (inc3) != 1 )
// diag offset-related

View File

@@ -522,6 +522,8 @@ typedef struct obj_s
inc_t ps; // panel stride (distance to next panel)
inc_t pd; // panel dimension (the "width" of a panel:
// usually MR or NR)
dim_t m_panel; // m dimension of a "full" panel
dim_t n_panel; // n dimension of a "full" panel
} obj_t;
@@ -565,6 +567,8 @@ typedef struct obj_s
(b).n_padded = (a).n_padded; \
(b).ps = (a).ps; \
(b).pd = (a).pd; \
(b).m_panel = (a).m_panel; \
(b).n_panel = (a).n_panel; \
}
#define bli_obj_init_subpart_from( a, b ) \
@@ -596,6 +600,8 @@ typedef struct obj_s
(b).n_padded = (a).n_padded; \
(b).pd = (a).pd; \
(b).ps = (a).ps; \
(b).m_panel = (a).m_panel; \
(b).n_panel = (a).n_panel; \
}