From 518a1756ccf02122b96fc437b538604a597df42a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 19 Feb 2015 14:27:09 -0600 Subject: [PATCH] Fixed indexing bug for trmm3 via 3mh, 4mh. Details: - Fixed a bug that only affected trmm3 when performed via 3mh or 4mh, whereby micro-panels of the triangular matrix were packed with "dead space" between them due to failing to adjust for the fact that pointer arithmetic was occurring in units of complex elements while the data being packed consisted of real elements. It turns out that the macro-kernel suffered from the same bug, meaning the panels were actually being packed and read consistently. The only way I was able to discover the bug in the first place was because the packed block of A was overflowing into the beginning of the packed row panel of B using the sandybridge configuration. --- frame/1m/packm/bli_packm_blk_var2.c | 17 +++++++++-------- frame/3/trmm/bli_trmm_ll_ker_var2.c | 12 +++++++----- frame/3/trmm/bli_trmm_lu_ker_var2.c | 12 +++++++----- frame/3/trmm/bli_trmm_rl_ker_var2.c | 12 +++++++----- frame/3/trmm/bli_trmm_ru_ker_var2.c | 12 +++++++----- frame/3/trsm/bli_trsm_ll_ker_var2.c | 11 ++++++----- frame/3/trsm/bli_trsm_lu_ker_var2.c | 11 ++++++----- frame/3/trsm/bli_trsm_rl_ker_var2.c | 11 ++++++----- frame/3/trsm/bli_trsm_ru_ker_var2.c | 11 ++++++----- 9 files changed, 61 insertions(+), 48 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index e0f6b9f26..838f70aef 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -324,14 +324,15 @@ void PASTEMAC(ch,varname)( \ n_panel_max = &panel_len_max_i; \ } \ \ - /* Compute the storage stride. Usually this is just ldp. However, in - the case of 3m, we need to scale by 3/2. We break up this scaling - factor into numerator and denominator since it cannot be represented - by a single integer. 
*/ \ - if ( bli_is_3m_packed( schema ) ) { ss_num = 3; \ - ss_den = 2; } \ - else { ss_num = 1; \ - ss_den = 1; } \ + /* Compute the storage stride scaling. Usually this is just 1. However, + in the case of interleaved 3m, we need to scale by 3/2, and in the + cases of real-only, imag-only, or summed-only, we need to scale by + 1/2. In both cases, we are compensating for the fact that pointer + arithmetic occurs in terms of complex elements rather than real + elements. */ \ + if ( bli_is_3m_packed( schema ) ) { ss_num = 3; ss_den = 2; } \ + else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \ + else { ss_num = 1; ss_den = 1; } \ \ /* Compute the total number of iterations we'll need. */ \ num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 3f2098b73..43b81e9f9 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ - ss_a_den = 2; } \ - else { ss_a_num = 1; \ - ss_a_den = 1; } \ + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. 
*/ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 7ab185447..653e20d43 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ - ss_a_den = 2; } \ - else { ss_a_num = 1; \ - ss_a_den = 1; } \ + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 531eacb7d..2cadebba0 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ - ss_b_den = 2; } \ - else { ss_b_num = 1; \ - ss_b_den = 1; } \ + offset by 3/2. 
And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index f4687939a..64e5a453c 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ - ss_b_den = 2; } \ - else { ss_b_num = 1; \ - ss_b_den = 1; } \ + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index d9ae541e9..bd066d627 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -262,11 +262,12 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. 
However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ - ss_a_den = 2; } \ - else { ss_a_num = 1; \ - ss_a_den = 1; } \ + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 725b7618e..67ae3d55f 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -263,11 +263,12 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ - ss_a_den = 2; } \ - else { ss_a_num = 1; \ - ss_a_den = 1; } \ + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 9dacf31a5..bdee5b4d1 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -271,11 +271,12 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. 
However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ - ss_b_den = 2; } \ - else { ss_b_num = 1; \ - ss_b_den = 1; } \ + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index f861c8045..e65f946e7 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -270,11 +270,12 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ - ss_b_den = 2; } \ - else { ss_b_num = 1; \ - ss_b_den = 1; } \ + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and