diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 6d64ed8d8..8d2289dff 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -374,6 +374,9 @@ void PASTEMAC(ch,varname)( \ matrices usually have several micro-panels that are shorter than a "full" micro-panel. */ \ p_inc = ldp * panel_len_max_i; \ +\ + /* We nudge the panel increment up by one if it is odd. */ \ + p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ { \ diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index 6bffc512c..e0f6b9f26 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -328,9 +328,9 @@ void PASTEMAC(ch,varname)( \ the case of 3m, we need to scale by 3/2. We break up this scaling factor into numerator and denominator since it cannot be represented by a single integer. */ \ - if ( bli_is_3m_packed( schema ) ) { ss_num = 3*ldp; \ + if ( bli_is_3m_packed( schema ) ) { ss_num = 3; \ ss_den = 2; } \ - else { ss_num = 1*ldp; \ + else { ss_num = 1; \ ss_den = 1; } \ \ /* Compute the total number of iterations we'll need. */ \ @@ -442,7 +442,12 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ /* NOTE: This value is usually LESS than ps_p because triangular matrices usually have several micro-panels that are shorter than a "full" micro-panel. */ \ - p_inc = ( panel_len_max_i * ss_num ) / ss_den; \ + p_inc = ldp * panel_len_max_i; \ +\ + /* We nudge the panel increment up by one if it is odd. */ \ + p_inc += ( bli_is_odd( p_inc ) ? 
1 : 0 ); \ +\ + p_inc = ( p_inc * ss_num ) / ss_den; \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ { \ @@ -511,6 +516,28 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ p_inc = ps_p; \ } \ \ +/* + if ( bli_is_ro_packed( schema ) ) { \ + if ( col_stored ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \ + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + } \ + if ( row_stored && *n_panel_use == 3 ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \ + ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + } \ + } \ +*/ \ +/* + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ +\ +\ /* if ( row_stored ) { \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \ diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 6e4723076..640a57ab4 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -173,6 +173,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, obj_t* p ) { num_t dt = bli_obj_datatype( *c ); + num_t dt_real = bli_obj_datatype_proj_to_real( *c ); trans_t transc = bli_obj_onlytrans_status( *c ); dim_t m_c = bli_obj_length( *c ); dim_t n_c = bli_obj_width( *c ); @@ -344,6 +345,13 @@ void bli_packm_init_pack( invdiag_t invert_diag, // dimension of the matrix is not a whole multiple of MR. ps_p = cs_p * n_p_pad; + // As a general rule, we don't want panel strides to be odd. This + // is primarily motivated by our desire to support interleaved 3m + // micro-panels, in which case we have to scale the panel stride + // by 3/2. 
That division by 2 means the numerator (prior to being + // scaled by 3) must be even. + if ( bli_is_odd( ps_p ) ) ps_p += 1; + // Query the micro-panel alignment for A. upanel_a_align = bli_blksz_for_type( dt, gemm_upanel_a_align ); @@ -365,9 +373,29 @@ void bli_packm_init_pack( invdiag_t invert_diag, bli_is_io_packed( pack_schema ) || bli_is_rpi_packed( pack_schema ) ) { - // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_a_align ); + // Acquire the element size of the real projection of the + // current complex datatype. + siz_t elem_size_p_real = elem_size_p / 2; + // Acquire the micro-panel alignment for the real projection of + // the current complex datatype. + upanel_a_align = bli_blksz_for_type( dt_real, gemm_upanel_a_align ); + + // Align the panel stride according to the micro-panel alignment. + ps_p = bli_align_dim_to_size( ps_p, elem_size_p_real, upanel_a_align ); + + // The division by 2 below assumes that ps_p is an even number. + // However, it is possible that, at this point, ps_p is odd. + // If it is indeed odd, we nudge it higher. + if ( bli_is_odd( ps_p ) ) ps_p += 1; + + // Despite the fact that the packed micro-panels will contain + // real elements, the panel stride that we store in the obj_t + // (which is passed into the macro-kernel) needs to be in units + // of complex elements, since the macro-kernel will index through + // micro-panels via complex pointer arithmetic for trmm/trsm. + // Since the indexing "increment" will be twice as large as each + // actual stored element, we divide the panel_stride by 2. ps_p = ps_p / 2; } else @@ -415,6 +443,13 @@ void bli_packm_init_pack( invdiag_t invert_diag, // dimension of the matrix is not a whole multiple of NR. ps_p = m_p_pad * rs_p; + // As a general rule, we don't want panel strides to be odd. 
This + // is primarily motivated by our desire to support interleaved 3m + // micro-panels, in which case we have to scale the panel stride + // by 3/2. That division by 2 means the numerator (prior to being + // scaled by 3) must be even. + if ( bli_is_odd( ps_p ) ) ps_p += 1; + // Query the micro-panel alignment for B. upanel_b_align = bli_blksz_for_type( dt, gemm_upanel_b_align ); @@ -436,9 +471,29 @@ void bli_packm_init_pack( invdiag_t invert_diag, bli_is_io_packed( pack_schema ) || bli_is_rpi_packed( pack_schema ) ) { - // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_b_align ); + // Acquire the element size of the real projection of the + // current complex datatype. + siz_t elem_size_p_real = elem_size_p / 2; + // Acquire the micro-panel alignment for the real projection of + // the current complex datatype. + upanel_b_align = bli_blksz_for_type( dt_real, gemm_upanel_b_align ); + + // Align the panel stride according to the micro-panel alignment. + ps_p = bli_align_dim_to_size( ps_p, elem_size_p_real, upanel_b_align ); + + // The division by 2 below assumes that ps_p is an even number. + // However, it is possible that, at this point, ps_p is odd. + // If it is indeed odd, we nudge it higher. + if ( bli_is_odd( ps_p ) ) ps_p += 1; + + // Despite the fact that the packed micro-panels will contain + // real elements, the panel stride that we store in the obj_t + // (which is passed into the macro-kernel) needs to be in units + // of complex elements, since the macro-kernel will index through + // micro-panels via complex pointer arithmetic for trmm/trsm. + // Since the indexing "increment" will be twice as large as each + // actual stored element, we divide the panel_stride by 2. 
ps_p = ps_p / 2; } else diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index f80b6e491..3f2098b73 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -213,6 +213,11 @@ void PASTEMAC(ch,varname)( \ rs_c == (no assumptions) cs_c == (no assumptions) */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ @@ -243,14 +248,12 @@ void PASTEMAC(ch,varname)( \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ - /* Compute the storage stride. Usually this is just PACKMR (for A - or PACKNR (for B). However, in the case of 3m, we need to scale - the offset by 3/2. Since it's possible we may need to scale - the packing dimension by a non-integer value, we break up the - scaling factor into numerator and denominator. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ ss_a_den = 2; } \ - else { ss_a_num = 1*PACKMR; \ + else { ss_a_num = 1; \ ss_a_den = 1; } \ \ /* If there is a zero region above where the diagonal of A intersects the @@ -348,7 +351,9 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_a_cur = ( k_a1011 * ss_a_num ) / ss_a_den; \ + ps_a_cur = k_a1011 * PACKMR; \ + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 
1 : 0 ); \ + ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \ \ if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \ \ @@ -409,10 +414,6 @@ void PASTEMAC(ch,varname)( \ } \ \ a1 += ps_a_cur; \ -\ -/* -printf( "bli_trmm_ll_ker_var2: applying ps_a_cur = %lu\n", ps_a_cur ); \ -*/ \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ @@ -470,9 +471,6 @@ printf( "bli_trmm_ll_ker_var2: applying ps_a_cur = %lu\n", ps_a_cur ); \ } \ \ a1 += rstep_a; \ -/* -printf( "bli_trmm_ll_ker_var2: applying rstep_a = %lu\n", rstep_a ); \ -*/ \ } \ \ c11 += rstep_c; \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 8b18918b3..7ab185447 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -213,6 +213,11 @@ void PASTEMAC(ch,varname)( \ rs_c == (no assumptions) cs_c == (no assumptions) */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ @@ -243,14 +248,12 @@ void PASTEMAC(ch,varname)( \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ - /* Compute the storage stride. Usually this is just PACKMR (for A - or PACKNR (for B). However, in the case of 3m, we need to scale - the offset by 3/2. Since it's possible we may need to scale - the packing dimension by a non-integer value, we break up the - scaling factor into numerator and denominator. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. 
*/ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ ss_a_den = 2; } \ - else { ss_a_num = 1*PACKMR; \ + else { ss_a_num = 1; \ ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A @@ -355,7 +358,9 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_a_cur = ( k_a1112 * ss_a_num ) / ss_a_den; \ + ps_a_cur = k_a1112 * PACKMR; \ + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \ \ if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \ \ @@ -415,9 +420,6 @@ void PASTEMAC(ch,varname)( \ } \ } \ \ -/* -printf( "bli_trmm_lu_ker_var2: applying ps_a_cur = %lu\n", ps_a_cur ); \ -*/ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ @@ -475,9 +477,6 @@ printf( "bli_trmm_lu_ker_var2: applying ps_a_cur = %lu\n", ps_a_cur ); \ } \ } \ \ -/* -printf( "bli_trmm_lu_ker_var2: applying rstep_a = %lu\n", rstep_a ); \ -*/ \ a1 += rstep_a; \ } \ \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 5d7360fc2..531eacb7d 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -213,6 +213,11 @@ void PASTEMAC(ch,varname)( \ rs_c == (no assumptions) cs_c == (no assumptions) */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ @@ -243,14 +248,12 @@ void PASTEMAC(ch,varname)( \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ - /* Compute the storage stride. Usually this is just PACKMR (for A - or PACKNR (for B). However, in the case of 3m, we need to scale - the offset by 3/2. 
Since it's possible we may need to scale - the packing dimension by a non-integer value, we break up the - scaling factor into numerator and denominator. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ ss_b_den = 2; } \ - else { ss_b_num = 1*PACKNR; \ + else { ss_b_num = 1; \ ss_b_den = 1; } \ \ /* If there is a zero region above where the diagonal of B intersects @@ -345,7 +348,9 @@ void PASTEMAC(ch,varname)( \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \ + ps_b_cur = k_b1121 * PACKNR; \ + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \ \ if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 440c5a67c..f4687939a 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -213,6 +213,11 @@ void PASTEMAC(ch,varname)( \ rs_c == (no assumptions) cs_c == (no assumptions) */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ @@ -243,14 +248,12 @@ void PASTEMAC(ch,varname)( \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ - /* Compute the storage stride. Usually this is just PACKMR (for A - or PACKNR (for B). However, in the case of 3m, we need to scale - the offset by 3/2. 
Since it's possible we may need to scale - the packing dimension by a non-integer value, we break up the - scaling factor into numerator and denominator. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ ss_b_den = 2; } \ - else { ss_b_num = 1*PACKNR; \ + else { ss_b_num = 1; \ ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B @@ -345,7 +348,9 @@ void PASTEMAC(ch,varname)( \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_b_cur = ( k_b0111 * ss_b_num ) / ss_b_den; \ + ps_b_cur = k_b0111 * PACKNR; \ + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \ \ if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 103d2a122..d9ae541e9 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -225,6 +225,11 @@ void PASTEMAC(ch,varname)( \ rs_c == (no assumptions) cs_c == (no assumptions) */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ @@ -255,14 +260,12 @@ void PASTEMAC(ch,varname)( \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ - /* Compute the storage stride. Usually this is just PACKMR (for A - or PACKNR (for B). However, in the case of 3m, we need to scale - the offset by 3/2. 
Since it's possible we may need to scale - the packing dimension by a non-integer value, we break up the - scaling factor into numerator and denominator. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ ss_a_den = 2; } \ - else { ss_a_num = 1*PACKMR; \ + else { ss_a_num = 1; \ ss_a_den = 1; } \ \ /* If there is a zero region above where the diagonal of A intersects the @@ -378,7 +381,9 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_a_cur = ( k_a1011 * ss_a_num ) / ss_a_den; \ + ps_a_cur = k_a1011 * PACKMR; \ + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \ \ /* Compute the addresses of the panel A10 and the triangular block A11. */ \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 519c285a7..725b7618e 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -226,6 +226,11 @@ void PASTEMAC(ch,varname)( \ rs_c == (no assumptions) cs_c == (no assumptions) */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ @@ -256,14 +261,12 @@ void PASTEMAC(ch,varname)( \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ - /* Compute the storage stride. Usually this is just PACKMR (for A - or PACKNR (for B). However, in the case of 3m, we need to scale - the offset by 3/2. 
Since it's possible we may need to scale - the packing dimension by a non-integer value, we break up the - scaling factor into numerator and denominator. */ \ - if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. */ \ + if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3; \ ss_a_den = 2; } \ - else { ss_a_num = 1*PACKMR; \ + else { ss_a_num = 1; \ ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A @@ -388,7 +391,9 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_a_cur = ( k_a1112 * ss_a_num ) / ss_a_den; \ + ps_a_cur = k_a1112 * PACKMR; \ + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \ \ /* Compute the addresses of the triangular block A11 and the panel A12. */ \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 6e2592d13..9dacf31a5 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -234,6 +234,11 @@ void PASTEMAC(ch,varname)( \ needs to be packed with MR (remember: B is the triangular matrix in the right-hand side parameter case). */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ @@ -264,14 +269,12 @@ void PASTEMAC(ch,varname)( \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ - /* Compute the storage stride. Usually this is just PACKMR (for A - or PACKNR (for B). However, in the case of 3m, we need to scale - the offset by 3/2. 
Since it's possible we may need to scale - the packing dimension by a non-integer value, we break up the - scaling factor into numerator and denominator. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ ss_b_den = 2; } \ - else { ss_b_num = 1*PACKNR; \ + else { ss_b_num = 1; \ ss_b_den = 1; } \ \ /* If there is a zero region above where the diagonal of B intersects @@ -398,7 +401,9 @@ void PASTEMAC(ch,varname)( \ b21 = b1 + ( k_b11 * PACKNR ) / off_scl; \ \ /* Compute the panel stride for the current micro-panel. */ \ - ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \ + ps_b_cur = k_b1121 * PACKNR; \ + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \ \ /* Save the imaginary stride of B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 56c60ccee..f861c8045 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -233,6 +233,11 @@ void PASTEMAC(ch,varname)( \ needs to be packed with MR (remember: B is the triangular matrix in the right-hand side parameter case). */ \ +\ + /* Safety trap: Certain indexing within this macro-kernel does not + work as intended if both MR and NR are odd. */ \ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ @@ -263,14 +268,12 @@ void PASTEMAC(ch,varname)( \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ - /* Compute the storage stride. Usually this is just PACKMR (for A - or PACKNR (for B). 
However, in the case of 3m, we need to scale - the offset by 3/2. Since it's possible we may need to scale - the packing dimension by a non-integer value, we break up the - scaling factor into numerator and denominator. */ \ - if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \ + /* Compute the storage stride scaling. Usually this is just 1. + However, in the case of interleaved 3m, we need to scale the + offset by 3/2. */ \ + if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3; \ ss_b_den = 2; } \ - else { ss_b_num = 1*PACKNR; \ + else { ss_b_num = 1; \ ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B @@ -391,7 +394,9 @@ void PASTEMAC(ch,varname)( \ b11 = b1 + ( k_b01 * PACKNR ) / off_scl; \ \ /* Compute the panel stride for the current micro-panel. */ \ - ps_b_cur = ( k_b0111 * ss_b_num ) / ss_b_den; \ + ps_b_cur = k_b0111 * PACKNR; \ + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \ \ /* Save the imaginary stride of B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 832dd9f48..6669c0ab9 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -248,6 +248,11 @@ #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) +// is_odd, is_even (integer parity tests; the argument is parenthesized so compound expressions are safe, and is_odd also holds for negative odd values) + +#define bli_is_odd( a ) ( ( a ) % 2 != 0 ) +#define bli_is_even( a ) ( ( a ) % 2 == 0 ) + // swap_types #define bli_swap_types( type1, type2 ) \