mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Fixed unit register blocksize brokenness.
Details: - Fixed a breakdown in BLIS's ability to differentiate between row-stored and column-stored micro-panels when MR or NR is unit. When either register blocksize (or both) is equal to one, inspecting the strides of the affected packed micro-panel is no longer sufficient to determine whether the micro-panel is a row-stored column panel or a column-stored row panel (because both strides are unit). At that point, dimension information is necessary when invoking the bli_is_row_stored_f() and bli_is_col_stored_f() macros (and their "obj" counterparts). Thanks to Ilya Polkovnichenko for reporting this bug. - Added panel dimensions (m and n) to obj_t, which are set in packm_init() and then passed into the blocked variants to support the aforementioned update.
This commit is contained in:
@@ -49,6 +49,8 @@ typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
dim_t m_panel,
|
||||
dim_t n_panel,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
@@ -78,6 +80,8 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
dim_t m_panel = bli_obj_panel_length( *p );
|
||||
dim_t n_panel = bli_obj_panel_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
@@ -116,11 +120,13 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
m_panel,
|
||||
n_panel,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p,
|
||||
t );
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
@@ -140,6 +146,8 @@ void PASTEMAC(ch,varname )( \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
@@ -204,7 +212,7 @@ void PASTEMAC(ch,varname )( \
|
||||
/* If the strides of P indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
@@ -213,7 +221,7 @@ void PASTEMAC(ch,varname )( \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t)panel_dim_max; \
|
||||
diagoffc_inc = -( doff_t )panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
@@ -222,7 +230,7 @@ void PASTEMAC(ch,varname )( \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
@@ -264,7 +272,7 @@ void PASTEMAC(ch,varname )( \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
@@ -295,20 +303,20 @@ void PASTEMAC(ch,varname )( \
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
|
||||
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i < 0 ) || \
|
||||
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
@@ -319,8 +327,8 @@ void PASTEMAC(ch,varname )( \
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
@@ -334,8 +342,7 @@ void PASTEMAC(ch,varname )( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
}\
|
||||
\
|
||||
}\
|
||||
\
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
@@ -348,8 +355,8 @@ void PASTEMAC(ch,varname )( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
@@ -361,7 +368,7 @@ void PASTEMAC(ch,varname )( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -375,8 +382,19 @@ void PASTEMAC(ch,varname )( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
/*
|
||||
printf( "packm_var1: gen case\n" ); \
|
||||
printf( "packm_var1: m_panel_use = %d\n", *m_panel_use ); \
|
||||
printf( "packm_var1: n_panel_use = %d\n", *n_panel_use ); \
|
||||
printf( "packm_var1: m_panel_max = %d\n", *m_panel_max ); \
|
||||
printf( "packm_var1: n_panel_max = %d\n", *n_panel_max ); \
|
||||
printf( "packm_var1: m_panel = %d\n", m_panel ); \
|
||||
printf( "packm_var1: n_panel = %d\n", n_panel ); \
|
||||
printf( "packm_var1: rs_c cs_c = %d %d\n", rs_c, cs_c ); \
|
||||
printf( "packm_var1: rs_p cs_p = %d %d\n", rs_p, cs_p ); \
|
||||
*/ \
|
||||
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
@@ -388,23 +406,32 @@ void PASTEMAC(ch,varname )( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
/*
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
|
||||
p_begin, rs_p, cs_p, "%9.2e", "" ); \
|
||||
else if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
|
||||
p_begin, rs_p, cs_p, "%9.2e", "" ); \
|
||||
*/ \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( rs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", panel_dim_max, panel_len_max_i, \
|
||||
p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
p_begin, rs_p, cs_p, "%9.2e", "" ); \
|
||||
if ( cs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", panel_len_max_i, panel_dim_max, \
|
||||
p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
p_begin, rs_p, cs_p, "%9.2e", "" ); \
|
||||
*/ \
|
||||
\
|
||||
}
|
||||
|
||||
@@ -53,6 +53,8 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
|
||||
@@ -49,6 +49,8 @@ typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
dim_t m_panel,
|
||||
dim_t n_panel,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
@@ -78,6 +80,8 @@ void bli_packm_blk_var3( obj_t* c,
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
dim_t m_panel = bli_obj_panel_length( *p );
|
||||
dim_t n_panel = bli_obj_panel_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
@@ -111,26 +115,27 @@ void bli_packm_blk_var3( obj_t* c,
|
||||
// real domain counterparts. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( thread_am_ochief( t ) ) {
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
if ( thread_am_ochief( t ) )
|
||||
{
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
@@ -156,11 +161,13 @@ void bli_packm_blk_var3( obj_t* c,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
m_panel,
|
||||
n_panel,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p,
|
||||
t );
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
@@ -180,6 +187,8 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
@@ -244,7 +253,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of P indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
@@ -253,7 +262,7 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t)panel_dim_max; \
|
||||
diagoffc_inc = -( doff_t )panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
@@ -262,7 +271,7 @@ void PASTEMAC(ch,varname)( \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
@@ -304,8 +313,8 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
@@ -335,20 +344,20 @@ void PASTEMAC(ch,varname)( \
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
|
||||
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i < 0 ) || \
|
||||
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
@@ -359,8 +368,8 @@ void PASTEMAC(ch,varname)( \
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
@@ -374,7 +383,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
@@ -398,8 +407,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
@@ -411,8 +420,8 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
} \
|
||||
@@ -425,8 +434,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
@@ -438,11 +447,12 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
\
|
||||
} \
|
||||
/*
|
||||
if ( cs_p == 1 ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \
|
||||
@@ -452,9 +462,8 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -53,6 +53,8 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
|
||||
@@ -49,6 +49,8 @@ typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
dim_t m_panel,
|
||||
dim_t n_panel,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
@@ -78,6 +80,8 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
dim_t m_panel = bli_obj_panel_length( *p );
|
||||
dim_t n_panel = bli_obj_panel_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
@@ -111,26 +115,27 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
// real domain counterparts. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if( thread_am_ochief( t ) ) {
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
if( thread_am_ochief( t ) )
|
||||
{
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
@@ -156,11 +161,13 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
m_panel,
|
||||
n_panel,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p,
|
||||
t );
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
@@ -180,6 +187,8 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
@@ -244,7 +253,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of P indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
@@ -253,7 +262,7 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t)panel_dim_max; \
|
||||
diagoffc_inc = -( doff_t )panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
@@ -262,7 +271,7 @@ void PASTEMAC(ch,varname)( \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
@@ -304,8 +313,8 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
@@ -335,20 +344,20 @@ void PASTEMAC(ch,varname)( \
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
|
||||
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i < 0 ) || \
|
||||
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
@@ -359,8 +368,8 @@ void PASTEMAC(ch,varname)( \
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
@@ -374,7 +383,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
\
|
||||
@@ -405,8 +414,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
@@ -418,7 +427,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -432,8 +441,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
@@ -445,11 +454,12 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
\
|
||||
} \
|
||||
/*
|
||||
if ( cs_p == 1 ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \
|
||||
@@ -467,9 +477,8 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -53,6 +53,8 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
|
||||
@@ -62,7 +62,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
@@ -71,7 +71,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
@@ -165,7 +165,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
@@ -175,7 +175,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
@@ -290,7 +290,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
@@ -300,7 +300,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
|
||||
@@ -84,7 +84,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
@@ -95,7 +95,7 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
@@ -139,14 +139,14 @@ void PASTEMAC(ch,varname)( \
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
@@ -171,8 +171,8 @@ void PASTEMAC(ch,varname)( \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
@@ -347,7 +347,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
@@ -359,7 +359,7 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
@@ -409,14 +409,14 @@ void PASTEMAC(ch,varname)( \
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
@@ -441,8 +441,8 @@ void PASTEMAC(ch,varname)( \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
@@ -683,7 +683,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
@@ -695,7 +695,7 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
@@ -745,14 +745,14 @@ void PASTEMAC(ch,varname)( \
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
if ( ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
@@ -777,8 +777,8 @@ void PASTEMAC(ch,varname)( \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
else /* if ( ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
|
||||
@@ -186,6 +186,7 @@ void bli_packm_init_pack( bool_t densify,
|
||||
dim_t nr_pack_dim = nr_def_dim + nr_ext_dim;
|
||||
|
||||
mem_t* mem_p;
|
||||
dim_t m_p, n_p;
|
||||
dim_t m_p_pad, n_p_pad;
|
||||
siz_t size_p;
|
||||
siz_t elem_size_p;
|
||||
@@ -242,8 +243,10 @@ void bli_packm_init_pack( bool_t densify,
|
||||
// in p) and aligning them to the dimension multiples (typically equal
|
||||
// to register blocksizes). This does waste a little bit of space for
|
||||
// level-2 operations, but that's okay with us.
|
||||
m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mr_def_dim );
|
||||
n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), nr_def_dim );
|
||||
m_p = bli_obj_length( *p );
|
||||
n_p = bli_obj_width( *p );
|
||||
m_p_pad = bli_align_dim_to_mult( m_p, mr_def_dim );
|
||||
n_p_pad = bli_align_dim_to_mult( n_p, nr_def_dim );
|
||||
|
||||
// Save the padded dimensions into the packed object. It is important
|
||||
// to save these dimensions since they represent the actual dimensions
|
||||
@@ -340,6 +343,8 @@ void bli_packm_init_pack( bool_t densify,
|
||||
bli_obj_set_incs( rs_p, cs_p, *p );
|
||||
bli_obj_set_panel_dim( m_panel, *p );
|
||||
bli_obj_set_panel_stride( ps_p, *p );
|
||||
bli_obj_set_panel_length( m_panel, *p );
|
||||
bli_obj_set_panel_width( n_p, *p );
|
||||
|
||||
// Compute the size of the packed buffer.
|
||||
size_p = ps_p * (m_p_pad / m_panel) * elem_size_p;
|
||||
@@ -381,6 +386,8 @@ void bli_packm_init_pack( bool_t densify,
|
||||
bli_obj_set_incs( rs_p, cs_p, *p );
|
||||
bli_obj_set_panel_dim( n_panel, *p );
|
||||
bli_obj_set_panel_stride( ps_p, *p );
|
||||
bli_obj_set_panel_length( m_p, *p );
|
||||
bli_obj_set_panel_width( n_panel, *p );
|
||||
|
||||
// Compute the size of the packed buffer.
|
||||
size_p = ps_p * (n_p_pad / n_panel) * elem_size_p;
|
||||
|
||||
@@ -64,7 +64,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
@@ -73,7 +73,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
@@ -253,7 +253,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
@@ -265,7 +265,7 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
@@ -489,7 +489,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
@@ -501,7 +501,7 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
|
||||
@@ -44,6 +44,8 @@ typedef void (*FUNCPTR_T)(
|
||||
trans_t transc,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_panel,
|
||||
dim_t n_panel,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
inc_t pd_p, inc_t ps_p,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
@@ -79,6 +81,8 @@ void bli_unpackm_blk_var2( obj_t* p,
|
||||
|
||||
dim_t m_c = bli_obj_length( *c );
|
||||
dim_t n_c = bli_obj_width( *c );
|
||||
dim_t m_panel = bli_obj_panel_length( *c );
|
||||
dim_t n_panel = bli_obj_panel_width( *c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( *p );
|
||||
inc_t rs_p = bli_obj_row_stride( *p );
|
||||
@@ -104,6 +108,8 @@ void bli_unpackm_blk_var2( obj_t* p,
|
||||
transc,
|
||||
m_c,
|
||||
n_c,
|
||||
m_panel,
|
||||
n_panel,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p,
|
||||
buf_c, rs_c, cs_c );
|
||||
@@ -121,6 +127,8 @@ void PASTEMAC(ch,varname )( \
|
||||
trans_t transc, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t pd_p, inc_t ps_p, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
@@ -145,8 +153,8 @@ void PASTEMAC(ch,varname )( \
|
||||
inc_t vs_c; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t ldp; \
|
||||
dim_t* m_panel; \
|
||||
dim_t* n_panel; \
|
||||
dim_t* m_panel_full; \
|
||||
dim_t* n_panel_full; \
|
||||
\
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
@@ -162,7 +170,7 @@ void PASTEMAC(ch,varname )( \
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to unpack from column panels. */ \
|
||||
iter_dim = n; \
|
||||
@@ -173,10 +181,10 @@ void PASTEMAC(ch,varname )( \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t)panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel = &m; \
|
||||
n_panel = &panel_dim_i; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to unpack from row panels. */ \
|
||||
iter_dim = m; \
|
||||
@@ -187,8 +195,8 @@ void PASTEMAC(ch,varname )( \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim_max; \
|
||||
ldp = cs_p; \
|
||||
m_panel = &panel_dim_i; \
|
||||
n_panel = &n; \
|
||||
m_panel_full = &panel_dim_i; \
|
||||
n_panel_full = &n; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
@@ -215,15 +223,15 @@ void PASTEMAC(ch,varname )( \
|
||||
lower stored, then we must call scal2m. Otherwise, we can use a
|
||||
variant that is oblivious to structure and storage (and thus tends
|
||||
to be faster). */ \
|
||||
if ( bli_intersects_diag_n( diagoffc_i, *m_panel, *n_panel ) && \
|
||||
if ( bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) && \
|
||||
bli_is_upper_or_lower( uploc ) ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2m)( diagoffc_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
transc, \
|
||||
*m_panel, \
|
||||
*n_panel, \
|
||||
*m_panel_full, \
|
||||
*n_panel_full, \
|
||||
one, \
|
||||
p_begin, rs_p, cs_p, \
|
||||
c_begin, rs_c, cs_c ); \
|
||||
@@ -239,7 +247,7 @@ void PASTEMAC(ch,varname )( \
|
||||
c_begin, incc, ldc ); \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "p copied", *m_panel, *n_panel, \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "p copied", *m_panel_full, *n_panel_full, \
|
||||
p_begin, rs_p, cs_p, "%4.1f", "" );*/ \
|
||||
} \
|
||||
\
|
||||
|
||||
@@ -48,6 +48,8 @@ void PASTEMAC(ch,varname)( \
|
||||
trans_t transc, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t pd_p, inc_t ps_p, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
|
||||
@@ -557,11 +557,17 @@ bli_obj_width_stored( obj )
|
||||
|
||||
#define bli_obj_is_row_stored( obj ) \
|
||||
\
|
||||
( bli_obj_col_stride_mag( obj ) == 1 )
|
||||
( bli_obj_col_stride_mag( obj ) == 1 && \
|
||||
( bli_obj_row_stride_mag( obj ) > 1 || \
|
||||
bli_obj_width( obj ) == 1 ) \
|
||||
)
|
||||
|
||||
#define bli_obj_is_col_stored( obj ) \
|
||||
\
|
||||
( bli_obj_row_stride_mag( obj ) == 1 )
|
||||
( bli_obj_row_stride_mag( obj ) == 1 && \
|
||||
( bli_obj_col_stride_mag( obj ) > 1 || \
|
||||
bli_obj_length( obj ) == 1 ) \
|
||||
)
|
||||
|
||||
#define bli_obj_is_gen_stored( obj ) \
|
||||
\
|
||||
@@ -735,7 +741,7 @@ bli_obj_width_stored( obj )
|
||||
}
|
||||
|
||||
|
||||
// Packed dimensions query
|
||||
// Packed matrix info query
|
||||
|
||||
#define bli_obj_padded_length( obj ) \
|
||||
\
|
||||
@@ -745,7 +751,7 @@ bli_obj_width_stored( obj )
|
||||
\
|
||||
( (obj).n_padded )
|
||||
|
||||
// Packed dimensions modification
|
||||
// Packed matrix info modification
|
||||
|
||||
#define bli_obj_set_padded_length( m0, obj ) \
|
||||
{ \
|
||||
@@ -764,48 +770,46 @@ bli_obj_width_stored( obj )
|
||||
}
|
||||
|
||||
|
||||
// Packed panel dimension query
|
||||
// Packed panel info query
|
||||
|
||||
#define bli_obj_panel_length( obj ) \
|
||||
\
|
||||
((obj).m_panel)
|
||||
|
||||
#define bli_obj_panel_width( obj ) \
|
||||
\
|
||||
((obj).n_panel)
|
||||
|
||||
#define bli_obj_panel_dim( obj ) \
|
||||
\
|
||||
((obj).pd)
|
||||
|
||||
// Packed panel dimension modification
|
||||
#define bli_obj_panel_stride( obj ) \
|
||||
\
|
||||
((obj).ps)
|
||||
|
||||
// Packed panel info modification
|
||||
|
||||
#define bli_obj_set_panel_length( m0, obj ) \
|
||||
{ \
|
||||
(obj).m_panel = m0; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_panel_width( n0, obj ) \
|
||||
{ \
|
||||
(obj).n_panel = n0; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_panel_dim( panel_dim, obj ) \
|
||||
{ \
|
||||
(obj).pd = panel_dim; \
|
||||
}
|
||||
|
||||
|
||||
// Packed panel stride query
|
||||
|
||||
#define bli_obj_panel_stride( obj ) \
|
||||
\
|
||||
((obj).ps)
|
||||
|
||||
// Packed panel stride modification
|
||||
|
||||
#define bli_obj_set_panel_stride( panel_stride, obj ) \
|
||||
{ \
|
||||
(obj).ps = panel_stride; \
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
// Cast mem entry query
|
||||
|
||||
#define bli_obj_cast_mem( obj ) \
|
||||
\
|
||||
( &((obj).cast_mem) )
|
||||
|
||||
// Cast mem entry modification
|
||||
|
||||
#define bli_obj_set_cast_mem( mem_p, obj ) \
|
||||
{ \
|
||||
(obj).cast_mem = *mem_p; \
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
// -- Miscellaneous object macros --
|
||||
|
||||
@@ -356,6 +356,7 @@
|
||||
( bli_does_notrans( trans ) ? ( m == 1 ? (cs) : (rs) ) \
|
||||
: ( m == 1 ? (rs) : (cs) ) )
|
||||
|
||||
/*
|
||||
#define bli_is_row_stored( rs, cs ) \
|
||||
\
|
||||
( bli_abs( cs ) == 1 )
|
||||
@@ -363,14 +364,15 @@
|
||||
#define bli_is_col_stored( rs, cs ) \
|
||||
\
|
||||
( bli_abs( rs ) == 1 )
|
||||
*/
|
||||
|
||||
#define bli_is_row_stored_f( rs, cs ) \
|
||||
#define bli_is_row_stored_f( m, n, rs, cs ) \
|
||||
\
|
||||
( cs == 1 )
|
||||
( cs == 1 && ( rs > 1 || n == 1 ) )
|
||||
|
||||
#define bli_is_col_stored_f( rs, cs ) \
|
||||
#define bli_is_col_stored_f( m, n, rs, cs ) \
|
||||
\
|
||||
( rs == 1 )
|
||||
( rs == 1 && ( cs > 1 || m == 1 ) )
|
||||
|
||||
#define bli_is_gen_stored( rs, cs ) \
|
||||
\
|
||||
@@ -391,14 +393,11 @@
|
||||
|
||||
#define bli_has_nonunit_inc2( inc1, inc2 ) \
|
||||
\
|
||||
( inc1 != 1 || \
|
||||
inc2 != 1 )
|
||||
( inc1 != 1 || inc2 != 1 )
|
||||
|
||||
#define bli_has_nonunit_inc3( inc1, inc2, inc3 ) \
|
||||
\
|
||||
( inc1 != 1 || \
|
||||
inc2 != 1 || \
|
||||
inc3 != 1 )
|
||||
( inc1 != 1 || inc2 != 1 || inc3 != 1 )
|
||||
|
||||
|
||||
// diag offset-related
|
||||
|
||||
@@ -522,6 +522,8 @@ typedef struct obj_s
|
||||
inc_t ps; // panel stride (distance to next panel)
|
||||
inc_t pd; // panel dimension (the "width" of a panel:
|
||||
// usually MR or NR)
|
||||
dim_t m_panel; // m dimension of a "full" panel
|
||||
dim_t n_panel; // n dimension of a "full" panel
|
||||
} obj_t;
|
||||
|
||||
|
||||
@@ -565,6 +567,8 @@ typedef struct obj_s
|
||||
(b).n_padded = (a).n_padded; \
|
||||
(b).ps = (a).ps; \
|
||||
(b).pd = (a).pd; \
|
||||
(b).m_panel = (a).m_panel; \
|
||||
(b).n_panel = (a).n_panel; \
|
||||
}
|
||||
|
||||
#define bli_obj_init_subpart_from( a, b ) \
|
||||
@@ -596,6 +600,8 @@ typedef struct obj_s
|
||||
(b).n_padded = (a).n_padded; \
|
||||
(b).pd = (a).pd; \
|
||||
(b).ps = (a).ps; \
|
||||
(b).m_panel = (a).m_panel; \
|
||||
(b).n_panel = (a).n_panel; \
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user