mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Fixed indexing bug for trmm3 via 3mh, 4mh.
Details: - Fixed a bug that only affected trmm3 when performed via 3mh or 4mh, whereby micro-panels of the triangular matrix were packed with "dead space" between them due to failing to adjust for the fact that pointer arithmetic was occurring in units of complex elements while the data being packed consisted of real elements. It turns out that the macro-kernel suffered from the same bug, meaning the panels were actually being packed and read consistently. The only way I was able to discover the bug in the first place was because the packed block of A was overflowing into the beginning of the packed row panel of B using the sandybridge configuration.
This commit is contained in:
@@ -324,14 +324,15 @@ void PASTEMAC(ch,varname)( \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the storage stride. Usually this is just ldp. However, in
|
||||
the case of 3m, we need to scale by 3/2. We break up this scaling
|
||||
factor into numerator and denominator since it cannot be represented
|
||||
by a single integer. */ \
|
||||
if ( bli_is_3m_packed( schema ) ) { ss_num = 3; \
|
||||
ss_den = 2; } \
|
||||
else { ss_num = 1; \
|
||||
ss_den = 1; } \
|
||||
/* Compute the storage stride scaling. Usually this is just 1. However,
|
||||
in the case of interleaved 3m, we need to scale by 3/2, and in the
|
||||
cases of real-only, imag-only, or summed-only, we need to scale by
|
||||
1/2. In both cases, we are compensating for the fact that pointer
|
||||
arithmetic occurs in terms of complex elements rather than real
|
||||
elements. */ \
|
||||
if ( bli_is_3m_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
|
||||
else { ss_num = 1; ss_den = 1; } \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
|
||||
@@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
|
||||
ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; \
|
||||
ss_a_den = 1; } \
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of A intersects the
|
||||
left edge of the block, adjust the pointer to C and treat this case as
|
||||
|
||||
@@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
|
||||
ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; \
|
||||
ss_a_den = 1; } \
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of A
|
||||
intersects the top edge of the block, adjust the pointer to B and
|
||||
|
||||
@@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
|
||||
ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; \
|
||||
ss_b_den = 1; } \
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of B intersects
|
||||
the left edge of the panel, adjust the pointer to A and treat this
|
||||
|
||||
@@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
|
||||
ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; \
|
||||
ss_b_den = 1; } \
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of B
|
||||
intersects the top edge of the panel, adjust the pointer to C and
|
||||
|
||||
@@ -262,11 +262,12 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
|
||||
ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; \
|
||||
ss_a_den = 1; } \
|
||||
offset by 3/2. Note that real-only, imag-only, and summed-only
|
||||
packing formats are not applicable here since trsm is a two-
|
||||
operand operation only (unlike trmm, which is capable of three-
|
||||
operand). */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of A intersects the
|
||||
left edge of the block, adjust the pointer to C and treat this case as
|
||||
|
||||
@@ -263,11 +263,12 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \
|
||||
ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; \
|
||||
ss_a_den = 1; } \
|
||||
offset by 3/2. Note that real-only, imag-only, and summed-only
|
||||
packing formats are not applicable here since trsm is a two-
|
||||
operand operation only (unlike trmm, which is capable of three-
|
||||
operand). */ \
|
||||
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of A
|
||||
intersects the top edge of the block, adjust the pointer to B and
|
||||
|
||||
@@ -271,11 +271,12 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
|
||||
ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; \
|
||||
ss_b_den = 1; } \
|
||||
offset by 3/2. Note that real-only, imag-only, and summed-only
|
||||
packing formats are not applicable here since trsm is a two-
|
||||
operand operation only (unlike trmm, which is capable of three-
|
||||
operand). */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of B intersects
|
||||
the left edge of the panel, adjust the pointer to A and treat this
|
||||
|
||||
@@ -270,11 +270,12 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \
|
||||
ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; \
|
||||
ss_b_den = 1; } \
|
||||
offset by 3/2. Note that real-only, imag-only, and summed-only
|
||||
packing formats are not applicable here since trsm is a two-
|
||||
operand operation only (unlike trmm, which is capable of three-
|
||||
operand). */ \
|
||||
if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of B
|
||||
intersects the top edge of the panel, adjust the pointer to C and
|
||||
|
||||
Reference in New Issue
Block a user