mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Fixed an obscure bug in 3mh/3m/4mh/4m packing.
Details:
- Modified bli_packm_blk_var1.c and _var2.c to increase the triangular case's panel increment by 1 if it would otherwise be odd. This is particularly necessary in _var2.c when handling the interleaved 3m or ro/io/rpi pack schemas, since division of an odd number by 2 can happen if both the panel length and the panel packing dimension (register packing blocksize) are odd, thus making their product odd.
- Modified bli_packm_init.c so that panel strides are increased by 1 if they would otherwise be odd, even for non-3m related packing.
- Modified the trmm and trsm macro-kernels so that triangular packed micro-panels are traversed with this new "increment by 1 if odd" policy.
- Added sanity checks in trmm and trsm macro-kernels that would result in an abort() if the conditions that would lead to a "divide odd integer by 2" scenario ever manifest.
- Defined bli_is_odd(), _is_even() macros in bli_scalar_macro_defs.h.
This commit is contained in:
@@ -374,6 +374,9 @@ void PASTEMAC(ch,varname)( \
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the panel increment up by one if it is odd. */ \
|
||||
p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
|
||||
@@ -328,9 +328,9 @@ void PASTEMAC(ch,varname)( \
|
||||
the case of 3m, we need to scale by 3/2. We break up this scaling
|
||||
factor into numerator and denominator since it cannot be represented
|
||||
by a single integer. */ \
|
||||
if ( bli_is_3m_packed( schema ) ) { ss_num = 3*ldp; \
|
||||
if ( bli_is_3m_packed( schema ) ) { ss_num = 3; \
|
||||
ss_den = 2; } \
|
||||
else { ss_num = 1*ldp; \
|
||||
else { ss_num = 1; \
|
||||
ss_den = 1; } \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
@@ -442,7 +442,12 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
/* NOTE: This value is usually LESS than ps_p because triangular
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
p_inc = ( panel_len_max_i * ss_num ) / ss_den; \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the panel increment up by one if it is odd. */ \
|
||||
p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \
|
||||
\
|
||||
p_inc = ( p_inc * ss_num ) / ss_den; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
@@ -511,6 +516,28 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
if ( bli_is_ro_packed( schema ) ) { \
|
||||
if ( col_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
if ( row_stored && *n_panel_use == 3 ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
} \
|
||||
*/ \
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
|
||||
|
||||
@@ -173,6 +173,7 @@ void bli_packm_init_pack( invdiag_t invert_diag,
|
||||
obj_t* p )
|
||||
{
|
||||
num_t dt = bli_obj_datatype( *c );
|
||||
num_t dt_real = bli_obj_datatype_proj_to_real( *c );
|
||||
trans_t transc = bli_obj_onlytrans_status( *c );
|
||||
dim_t m_c = bli_obj_length( *c );
|
||||
dim_t n_c = bli_obj_width( *c );
|
||||
@@ -344,6 +345,13 @@ void bli_packm_init_pack( invdiag_t invert_diag,
|
||||
// dimension of the matrix is not a whole multiple of MR.
|
||||
ps_p = cs_p * n_p_pad;
|
||||
|
||||
// As a general rule, we don't want panel strides to be odd. This
|
||||
// is primarily motivated by our desire to support interleaved 3m
|
||||
// micro-panels, in which case we have to scale the panel stride
|
||||
// by 3/2. That division by 2 means the numerator (prior to being
|
||||
// scaled by 3) must be even.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
|
||||
// Query the micro-panel alignment for A.
|
||||
upanel_a_align = bli_blksz_for_type( dt, gemm_upanel_a_align );
|
||||
|
||||
@@ -365,9 +373,29 @@ void bli_packm_init_pack( invdiag_t invert_diag,
|
||||
bli_is_io_packed( pack_schema ) ||
|
||||
bli_is_rpi_packed( pack_schema ) )
|
||||
{
|
||||
// Align the panel stride according to the micro-panel alignment.
|
||||
ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_a_align );
|
||||
// Acquire the element size of the real projection of the
|
||||
// current complex datatype.
|
||||
siz_t elem_size_p_real = elem_size_p / 2;
|
||||
|
||||
// Acquire the micro-panel alignment for the real projection of
|
||||
// the current complex datatype.
|
||||
upanel_a_align = bli_blksz_for_type( dt_real, gemm_upanel_a_align );
|
||||
|
||||
// Align the panel stride according to the micro-panel alignment.
|
||||
ps_p = bli_align_dim_to_size( ps_p, elem_size_p_real, upanel_a_align );
|
||||
|
||||
// The division by 2 below assumes that ps_p is an even number.
|
||||
// However, it is possible that, at this point, ps_p is odd.
|
||||
// If it is indeed odd, we nudge it higher.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
|
||||
// Despite the fact that the packed micro-panels will contain
|
||||
// real elements, the panel stride that we store in the obj_t
|
||||
// (which is passed into the macro-kernel) needs to be in units
|
||||
// of complex elements, since the macro-kernel will index through
|
||||
// micro-panels via complex pointer arithmetic for trmm/trsm.
|
||||
// Since the indexing "increment" will be twice as large as each
|
||||
// actual stored element, we divide the panel_stride by 2.
|
||||
ps_p = ps_p / 2;
|
||||
}
|
||||
else
|
||||
@@ -415,6 +443,13 @@ void bli_packm_init_pack( invdiag_t invert_diag,
|
||||
// dimension of the matrix is not a whole multiple of NR.
|
||||
ps_p = m_p_pad * rs_p;
|
||||
|
||||
// As a general rule, we don't want panel strides to be odd. This
|
||||
// is primarily motivated by our desire to support interleaved 3m
|
||||
// micro-panels, in which case we have to scale the panel stride
|
||||
// by 3/2. That division by 2 means the numerator (prior to being
|
||||
// scaled by 3) must be even.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
|
||||
// Query the micro-panel alignment for B.
|
||||
upanel_b_align = bli_blksz_for_type( dt, gemm_upanel_b_align );
|
||||
|
||||
@@ -436,9 +471,29 @@ void bli_packm_init_pack( invdiag_t invert_diag,
|
||||
bli_is_io_packed( pack_schema ) ||
|
||||
bli_is_rpi_packed( pack_schema ) )
|
||||
{
|
||||
// Align the panel stride according to the micro-panel alignment.
|
||||
ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_b_align );
|
||||
// Acquire the element size of the real projection of the
|
||||
// current complex datatype.
|
||||
siz_t elem_size_p_real = elem_size_p / 2;
|
||||
|
||||
// Acquire the micro-panel alignment for the real projection of
|
||||
// the current complex datatype.
|
||||
upanel_b_align = bli_blksz_for_type( dt_real, gemm_upanel_b_align );
|
||||
|
||||
// Align the panel stride according to the micro-panel alignment.
|
||||
ps_p = bli_align_dim_to_size( ps_p, elem_size_p_real, upanel_b_align );
|
||||
|
||||
// The division by 2 below assumes that ps_p is an even number.
|
||||
// However, it is possible that, at this point, ps_p is odd.
|
||||
// If it is indeed odd, we nudge it higher.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
|
||||
// Despite the fact that the packed micro-panels will contain
|
||||
// real elements, the panel stride that we store in the obj_t
|
||||
// (which is passed into the macro-kernel) needs to be in units
|
||||
// of complex elements, since the macro-kernel will index through
|
||||
// micro-panels via complex pointer arithmetic for trmm/trsm.
|
||||
// Since the indexing "increment" will be twice as large as each
|
||||
// actual stored element, we divide the panel_stride by 2.
|
||||
ps_p = ps_p / 2;
|
||||
}
|
||||
else
|
||||
|
||||
@@ -213,6 +213,11 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
@@ -243,14 +248,12 @@ void PASTEMAC(ch,varname)( \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride. Usually this is just PACKMR (for A
|
||||
or PACKNR (for B). However, in the case of 3m, we need to scale
|
||||
the offset by 3/2. Since it's possible we may need to scale
|
||||
the packing dimension by a non-integer value, we break up the
|
||||
scaling factor into numerator and denominator. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
|
||||
ss_a_den = 2; } \
|
||||
else { ss_a_num = 1*PACKMR; \
|
||||
else { ss_a_num = 1; \
|
||||
ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of A intersects the
|
||||
@@ -348,7 +351,9 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
ps_a_cur = ( k_a1011 * ss_a_num ) / ss_a_den; \
|
||||
ps_a_cur = k_a1011 * PACKMR; \
|
||||
ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
@@ -409,10 +414,6 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
\
|
||||
/*
|
||||
printf( "bli_trmm_ll_ker_var2: applying ps_a_cur = %lu\n", ps_a_cur ); \
|
||||
*/ \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
@@ -470,9 +471,6 @@ printf( "bli_trmm_ll_ker_var2: applying ps_a_cur = %lu\n", ps_a_cur ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
/*
|
||||
printf( "bli_trmm_ll_ker_var2: applying rstep_a = %lu\n", rstep_a ); \
|
||||
*/ \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
|
||||
@@ -213,6 +213,11 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
@@ -243,14 +248,12 @@ void PASTEMAC(ch,varname)( \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride. Usually this is just PACKMR (for A
|
||||
or PACKNR (for B). However, in the case of 3m, we need to scale
|
||||
the offset by 3/2. Since it's possible we may need to scale
|
||||
the packing dimension by a non-integer value, we break up the
|
||||
scaling factor into numerator and denominator. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
|
||||
ss_a_den = 2; } \
|
||||
else { ss_a_num = 1*PACKMR; \
|
||||
else { ss_a_num = 1; \
|
||||
ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of A
|
||||
@@ -355,7 +358,9 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
ps_a_cur = ( k_a1112 * ss_a_num ) / ss_a_den; \
|
||||
ps_a_cur = k_a1112 * PACKMR; \
|
||||
ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
@@ -415,9 +420,6 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
printf( "bli_trmm_lu_ker_var2: applying ps_a_cur = %lu\n", ps_a_cur ); \
|
||||
*/ \
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
@@ -475,9 +477,6 @@ printf( "bli_trmm_lu_ker_var2: applying ps_a_cur = %lu\n", ps_a_cur ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
printf( "bli_trmm_lu_ker_var2: applying rstep_a = %lu\n", rstep_a ); \
|
||||
*/ \
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
|
||||
@@ -213,6 +213,11 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
@@ -243,14 +248,12 @@ void PASTEMAC(ch,varname)( \
|
||||
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride. Usually this is just PACKMR (for A
|
||||
or PACKNR (for B). However, in the case of 3m, we need to scale
|
||||
the offset by 3/2. Since it's possible we may need to scale
|
||||
the packing dimension by a non-integer value, we break up the
|
||||
scaling factor into numerator and denominator. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
|
||||
ss_b_den = 2; } \
|
||||
else { ss_b_num = 1*PACKNR; \
|
||||
else { ss_b_num = 1; \
|
||||
ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of B intersects
|
||||
@@ -345,7 +348,9 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \
|
||||
ps_b_cur = k_b1121 * PACKNR; \
|
||||
ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
|
||||
@@ -213,6 +213,11 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
@@ -243,14 +248,12 @@ void PASTEMAC(ch,varname)( \
|
||||
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride. Usually this is just PACKMR (for A
|
||||
or PACKNR (for B). However, in the case of 3m, we need to scale
|
||||
the offset by 3/2. Since it's possible we may need to scale
|
||||
the packing dimension by a non-integer value, we break up the
|
||||
scaling factor into numerator and denominator. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
|
||||
ss_b_den = 2; } \
|
||||
else { ss_b_num = 1*PACKNR; \
|
||||
else { ss_b_num = 1; \
|
||||
ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of B
|
||||
@@ -345,7 +348,9 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
ps_b_cur = ( k_b0111 * ss_b_num ) / ss_b_den; \
|
||||
ps_b_cur = k_b0111 * PACKNR; \
|
||||
ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
|
||||
@@ -225,6 +225,11 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
@@ -255,14 +260,12 @@ void PASTEMAC(ch,varname)( \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride. Usually this is just PACKMR (for A
|
||||
or PACKNR (for B). However, in the case of 3m, we need to scale
|
||||
the offset by 3/2. Since it's possible we may need to scale
|
||||
the packing dimension by a non-integer value, we break up the
|
||||
scaling factor into numerator and denominator. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
|
||||
ss_a_den = 2; } \
|
||||
else { ss_a_num = 1*PACKMR; \
|
||||
else { ss_a_num = 1; \
|
||||
ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of A intersects the
|
||||
@@ -378,7 +381,9 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
ps_a_cur = ( k_a1011 * ss_a_num ) / ss_a_den; \
|
||||
ps_a_cur = k_a1011 * PACKMR; \
|
||||
ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
/* Compute the addresses of the panel A10 and the triangular
|
||||
block A11. */ \
|
||||
|
||||
@@ -226,6 +226,11 @@ void PASTEMAC(ch,varname)( \
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
@@ -256,14 +261,12 @@ void PASTEMAC(ch,varname)( \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride. Usually this is just PACKMR (for A
|
||||
or PACKNR (for B). However, in the case of 3m, we need to scale
|
||||
the offset by 3/2. Since it's possible we may need to scale
|
||||
the packing dimension by a non-integer value, we break up the
|
||||
scaling factor into numerator and denominator. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
|
||||
ss_a_den = 2; } \
|
||||
else { ss_a_num = 1*PACKMR; \
|
||||
else { ss_a_num = 1; \
|
||||
ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of A
|
||||
@@ -388,7 +391,9 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
ps_a_cur = ( k_a1112 * ss_a_num ) / ss_a_den; \
|
||||
ps_a_cur = k_a1112 * PACKMR; \
|
||||
ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
/* Compute the addresses of the triangular block A11 and the
|
||||
panel A12. */ \
|
||||
|
||||
@@ -234,6 +234,11 @@ void PASTEMAC(ch,varname)( \
|
||||
needs to be packed with MR (remember: B is the triangular matrix in
|
||||
the right-hand side parameter case).
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
@@ -264,14 +269,12 @@ void PASTEMAC(ch,varname)( \
|
||||
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride. Usually this is just PACKMR (for A
|
||||
or PACKNR (for B). However, in the case of 3m, we need to scale
|
||||
the offset by 3/2. Since it's possible we may need to scale
|
||||
the packing dimension by a non-integer value, we break up the
|
||||
scaling factor into numerator and denominator. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
|
||||
ss_b_den = 2; } \
|
||||
else { ss_b_num = 1*PACKNR; \
|
||||
else { ss_b_num = 1; \
|
||||
ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of B intersects
|
||||
@@ -398,7 +401,9 @@ void PASTEMAC(ch,varname)( \
|
||||
b21 = b1 + ( k_b11 * PACKNR ) / off_scl; \
|
||||
\
|
||||
/* Compute the panel stride for the current micro-panel. */ \
|
||||
ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \
|
||||
ps_b_cur = k_b1121 * PACKNR; \
|
||||
ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
/* Save the imaginary stride of B to the auxinfo_t object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
|
||||
@@ -233,6 +233,11 @@ void PASTEMAC(ch,varname)( \
|
||||
needs to be packed with MR (remember: B is the triangular matrix in
|
||||
the right-hand side parameter case).
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
@@ -263,14 +268,12 @@ void PASTEMAC(ch,varname)( \
|
||||
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride. Usually this is just PACKMR (for A
|
||||
or PACKNR (for B). However, in the case of 3m, we need to scale
|
||||
the offset by 3/2. Since it's possible we may need to scale
|
||||
the packing dimension by a non-integer value, we break up the
|
||||
scaling factor into numerator and denominator. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
|
||||
ss_b_den = 2; } \
|
||||
else { ss_b_num = 1*PACKNR; \
|
||||
else { ss_b_num = 1; \
|
||||
ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of B
|
||||
@@ -391,7 +394,9 @@ void PASTEMAC(ch,varname)( \
|
||||
b11 = b1 + ( k_b01 * PACKNR ) / off_scl; \
|
||||
\
|
||||
/* Compute the panel stride for the current micro-panel. */ \
|
||||
ps_b_cur = ( k_b0111 * ss_b_num ) / ss_b_den; \
|
||||
ps_b_cur = k_b0111 * PACKNR; \
|
||||
ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
/* Save the imaginary stride of B to the auxinfo_t object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
|
||||
@@ -248,6 +248,11 @@
|
||||
// Forwards its argument unchanged to isinf().
#define bli_isinf( a ) isinf( a )
|
||||
// Forwards its argument unchanged to isnan().
#define bli_isnan( a ) isnan( a )
|
||||
|
||||
// is_odd, is_even
|
||||
|
||||
// Evaluates to nonzero if the integer expression a is odd, and to zero
// otherwise. The argument is parenthesized so that compound expressions
// such as bli_is_odd( m + 1 ) are tested as a whole, and the remainder is
// compared against zero (rather than 1) because in C the remainder of a
// negative odd value divided by 2 is -1, not 1.
#define bli_is_odd( a ) ( ( a ) % 2 != 0 )
|
||||
// Evaluates to nonzero if the integer expression a is even, and to zero
// otherwise. The argument is parenthesized so that compound expressions
// such as bli_is_even( m + 1 ) are tested as a whole rather than being
// split apart by operator precedence.
#define bli_is_even( a ) ( ( a ) % 2 == 0 )
|
||||
|
||||
// swap_types
|
||||
|
||||
#define bli_swap_types( type1, type2 ) \
|
||||
|
||||
Reference in New Issue
Block a user