Fixed indexing bug for trmm3 via 3mh, 4mh.

Details:
- Fixed a bug that only affected trmm3 when performed via 3mh or 4mh,
  whereby micro-panels of the triangular matrix were packed with "dead
  space" between them due to failing to adjust for the fact that pointer
  arithmetic was occurring in units of complex elements while the data
  being packed consisted of real elements. It turns out that the macro-
  kernel suffered from the same bug, meaning the panels were actually
  being packed and read consistently. I was only able to discover the
  bug in the first place because the packed block of A was overflowing
  into the beginning of the packed row panel of B when using the
  sandybridge configuration.
This commit is contained in:
Field G. Van Zee
2015-02-19 14:27:09 -06:00
parent 493087d730
commit 518a1756cc
9 changed files with 61 additions and 48 deletions

View File

@@ -324,14 +324,15 @@ void PASTEMAC(ch,varname)( \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride. Usually this is just ldp. However, in
the case of 3m, we need to scale by 3/2. We break up this scaling
factor into numerator and denominator since it cannot be represented
by a single integer. */ \
if ( bli_is_3m_packed( schema ) ) { ss_num = 3; \
ss_den = 2; } \
else { ss_num = 1; \
ss_den = 1; } \
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3m_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \

View File

@@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
ss_a_den = 2; } \
else { ss_a_num = 1; \
ss_a_den = 1; } \
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as

View File

@@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
ss_a_den = 2; } \
else { ss_a_num = 1; \
ss_a_den = 1; } \
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and

View File

@@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. */ \
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
ss_b_den = 2; } \
else { ss_b_num = 1; \
ss_b_den = 1; } \
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this

View File

@@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. */ \
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
ss_b_den = 2; } \
else { ss_b_num = 1; \
ss_b_den = 1; } \
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and

View File

@@ -262,11 +262,12 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
ss_a_den = 2; } \
else { ss_a_num = 1; \
ss_a_den = 1; } \
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as

View File

@@ -263,11 +263,12 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
ss_a_den = 2; } \
else { ss_a_num = 1; \
ss_a_den = 1; } \
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and

View File

@@ -271,11 +271,12 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. */ \
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
ss_b_den = 2; } \
else { ss_b_num = 1; \
ss_b_den = 1; } \
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this

View File

@@ -270,11 +270,12 @@ void PASTEMAC(ch,varname)( \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. */ \
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
ss_b_den = 2; } \
else { ss_b_num = 1; \
ss_b_den = 1; } \
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and