diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index e0f6b9f26..838f70aef 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -324,14 +324,15 @@ void PASTEMAC(ch,varname)( \ n_panel_max = &panel_len_max_i; \ } \ \ - /* Compute the storage stride. Usually this is just ldp. However, in - the case of 3m, we need to scale by 3/2. We break up this scaling - factor into numerator and denominator since it cannot be represented - by a single integer. */ \ - if ( bli_is_3m_packed( schema ) ) { ss_num = 3; \ - ss_den = 2; } \ - else { ss_num = 1; \ - ss_den = 1; } \ + /* Compute the storage stride scaling. Usually this is just 1. However, + in the case of interleaved 3m, we need to scale by 3/2, and in the + cases of real-only, imag-only, or summed-only, we need to scale by + 1/2. In both cases, we are compensating for the fact that pointer + arithmetic occurs in terms of complex elements rather than real + elements. */ \ + if ( bli_is_3m_packed( schema ) ) { ss_num = 3; ss_den = 2; } \ + else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \ + else { ss_num = 1; ss_den = 1; } \ \ /* Compute the total number of iterations we'll need. */ \ num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 3f2098b73..43b81e9f9 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ - ss_a_den = 2; } \ - else { ss_a_num = 1; \ - ss_a_den = 1; } \ + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 7ab185447..653e20d43 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ - ss_a_den = 2; } \ - else { ss_a_num = 1; \ - ss_a_den = 1; } \ + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 531eacb7d..2cadebba0 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ - ss_b_den = 2; } \ - else { ss_b_num = 1; \ - ss_b_den = 1; } \ + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index f4687939a..64e5a453c 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -250,11 +250,13 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ - ss_b_den = 2; } \ - else { ss_b_num = 1; \ - ss_b_den = 1; } \ + offset by 3/2. And if we are packing real-only, imag-only, or + summed-only, we need to scale the computed panel sizes by 1/2 + to compensate for the fact that the pointer arithmetic occurs + in terms of complex elements rather than real elements. */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index d9ae541e9..bd066d627 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -262,11 +262,12 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ - ss_a_den = 2; } \ - else { ss_a_num = 1; \ - ss_a_den = 1; } \ + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 725b7618e..67ae3d55f 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -263,11 +263,12 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ - ss_a_den = 2; } \ - else { ss_a_num = 1; \ - ss_a_den = 1; } \ + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ + else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 9dacf31a5..bdee5b4d1 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -271,11 +271,12 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ - ss_b_den = 2; } \ - else { ss_b_num = 1; \ - ss_b_den = 1; } \ + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index f861c8045..e65f946e7 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -270,11 +270,12 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the - offset by 3/2. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ - ss_b_den = 2; } \ - else { ss_b_num = 1; \ - ss_b_den = 1; } \ + offset by 3/2. Note that real-only, imag-only, and summed-only + packing formats are not applicable here since trsm is a two- + operand operation only (unlike trmm, which is capable of three- + operand). */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ + else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and