diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index e8c700bb7..c53c8e6bc 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -256,6 +256,7 @@ void PASTEMAC(ch,varname)( \ conj_t conjc; \ bool_t row_stored; \ bool_t col_stored; \ + inc_t is_p_use; \ dim_t ss_num; \ dim_t ss_den; \ \ @@ -427,6 +428,14 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ +\ + /* We need to re-compute the imaginary stride as a function of + panel_len_max_i since triangular packed matrices have panels + of varying lengths. */ \ + is_p_use = ldp * panel_len_max_i; \ +\ + /* We nudge the imaginary stride up by one if it is odd. */ \ + is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ @@ -444,18 +453,17 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p, \ - is_p ); \ + is_p_use ); \ } \ \ /* NOTE: This value is usually LESS than ps_p because triangular matrices usually have several micro-panels that are shorter than a "full" micro-panel. */ \ +/* p_inc = ldp * panel_len_max_i; \ -\ - /* We nudge the panel increment up by one if it is odd. */ \ p_inc += ( bli_is_odd( p_inc ) ? 1 : 0 ); \ -\ - p_inc = ( p_inc * ss_num ) / ss_den; \ +*/ \ + p_inc = ( is_p_use * ss_num ) / ss_den; \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ { \ @@ -468,6 +476,8 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ +\ + is_p_use = is_p; \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ @@ -485,7 +495,7 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p, \ - is_p ); \ + is_p_use ); \ } \ \ /* NOTE: This value is equivalent to ps_p. */ \ @@ -502,6 +512,8 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ +\ + is_p_use = is_p; \ \ if( packm_thread_my_iter( it, thread ) ) \ { \ @@ -519,7 +531,7 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p, \ - is_p ); \ + is_p_use ); \ } \ \ /* NOTE: This value is equivalent to ps_p. */ \ @@ -527,21 +539,33 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ } \ \ /* - if ( bli_is_ro_packed( schema ) ) { \ + if ( bli_is_4mi_packed( schema ) ) { \ + printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \ if ( col_stored ) { \ + if ( 0 ) \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \ ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \ } \ - if ( row_stored && *n_panel_use == 3 ) { \ + if ( row_stored ) { \ + if ( 0 ) \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \ ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \ } \ } \ */ \ +/* +*/ \ +\ +/* +*/ \ /* PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_3mis.c b/frame/1m/packm/bli_packm_struc_cxk_3mis.c index 48de808d5..1ad507ec6 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_3mis.c +++ b/frame/1m/packm/bli_packm_struc_cxk_3mis.c @@ -57,7 +57,6 @@ void PASTEMAC(ch,varname)( \ { \ dim_t panel_dim; \ dim_t panel_len; \ - dim_t panel_len_max; \ inc_t incc, ldc; \ inc_t ldp; \ \ @@ -69,7 +68,6 @@ void PASTEMAC(ch,varname)( \ /* Prepare to pack to row-stored column panel. */ \ panel_dim = n_panel; \ panel_len = m_panel; \ - panel_len_max = m_panel_max; \ incc = cs_c; \ ldc = rs_c; \ ldp = rs_p; \ @@ -79,7 +77,6 @@ void PASTEMAC(ch,varname)( \ /* Prepare to pack to column-stored row panel. */ \ panel_dim = m_panel; \ panel_len = n_panel; \ - panel_len_max = n_panel_max; \ incc = rs_c; \ ldc = cs_c; \ ldp = cs_p; \ @@ -122,11 +119,6 @@ void PASTEMAC(ch,varname)( \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ { \ - /* We need to re-compute the imaginary stride as a function of - panel_len_max since triangular packed matrices have panels - of varying lengths. */ \ - is_p = ldp * panel_len_max; \ -\ /* Call a helper function for micro-panels of triangular matrices. */ \ PASTEMAC(ch,packm_tri_cxk_3mis)( strucc, \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_4mi.c b/frame/1m/packm/bli_packm_struc_cxk_4mi.c index aa5babfe7..e84ff23e4 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_4mi.c +++ b/frame/1m/packm/bli_packm_struc_cxk_4mi.c @@ -57,7 +57,6 @@ void PASTEMAC(ch,varname)( \ { \ dim_t panel_dim; \ dim_t panel_len; \ - dim_t panel_len_max; \ inc_t incc, ldc; \ inc_t ldp; \ \ @@ -69,7 +68,6 @@ void PASTEMAC(ch,varname)( \ /* Prepare to pack to row-stored column panel. */ \ panel_dim = n_panel; \ panel_len = m_panel; \ - panel_len_max = m_panel_max; \ incc = cs_c; \ ldc = rs_c; \ ldp = rs_p; \ @@ -79,7 +77,6 @@ void PASTEMAC(ch,varname)( \ /* Prepare to pack to column-stored row panel. */ \ panel_dim = m_panel; \ panel_len = n_panel; \ - panel_len_max = n_panel_max; \ incc = rs_c; \ ldc = cs_c; \ ldp = cs_p; \ @@ -122,11 +119,6 @@ void PASTEMAC(ch,varname)( \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ { \ - /* We need to re-compute the imaginary stride as a function of - panel_len_max since triangular packed matrices have panels - of varying lengths. */ \ - is_p = ldp * panel_len_max; \ -\ /* Call a helper function for micro-panels of triangular matrices. */ \ PASTEMAC(ch,packm_tri_cxk_4mi)( strucc, \ diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 67594a1c2..70bf338f2 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -96,6 +96,9 @@ void bli_trmm_front( side_t side, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. + // NOTE: We disable the optimization for 1x1 matrices since the concept + // of row- vs. column storage breaks down. + if ( !bli_obj_is_1x1( c_local ) ) if ( ( bli_obj_is_row_stored( c_local ) && bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index acb89dc9d..a50b05dc4 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -198,6 +198,7 @@ void PASTEMAC(ch,varname)( \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ + inc_t is_a_cur; \ auxinfo_t aux; \ \ /* @@ -297,6 +298,9 @@ void PASTEMAC(ch,varname)( \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, aux ); \ @@ -353,9 +357,9 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_a_cur = k_a1011 * PACKMR; \ - ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \ - ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \ \ @@ -378,7 +382,7 @@ void PASTEMAC(ch,varname)( \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( PACKMR * k_a1011, aux ); \ + bli_auxinfo_set_is_a( is_a_cur, aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 2b23fbbab..35a2cefe2 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -198,6 +198,7 @@ void PASTEMAC(ch,varname)( \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ + inc_t is_a_cur; \ auxinfo_t aux; \ \ /* @@ -304,6 +305,9 @@ void PASTEMAC(ch,varname)( \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, aux ); \ @@ -360,9 +364,9 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_a_cur = k_a1112 * PACKMR; \ - ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \ - ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \ \ @@ -385,7 +389,7 @@ void PASTEMAC(ch,varname)( \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( PACKMR * k_a1112, aux ); \ + bli_auxinfo_set_is_a( is_a_cur, aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 550c9a232..941f7a7f2 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -198,6 +198,7 @@ void PASTEMAC(ch,varname)( \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ + inc_t is_b_cur; \ auxinfo_t aux; \ \ /* @@ -304,6 +305,9 @@ void PASTEMAC(ch,varname)( \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, aux ); \ @@ -350,15 +354,15 @@ void PASTEMAC(ch,varname)( \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_b_cur = k_b1121 * PACKNR; \ - ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \ - ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( PACKNR * k_b1121, aux ); \ + bli_auxinfo_set_is_b( is_b_cur, aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 3e4d8dabc..6d7127f6f 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -198,6 +198,7 @@ void PASTEMAC(ch,varname)( \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ + inc_t is_b_cur; \ auxinfo_t aux; \ \ /* @@ -305,6 +306,9 @@ void PASTEMAC(ch,varname)( \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, aux ); \ @@ -350,15 +354,15 @@ void PASTEMAC(ch,varname)( \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_b_cur = k_b0111 * PACKNR; \ - ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \ - ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( PACKNR * k_b0111, aux ); \ + bli_auxinfo_set_is_b( is_b_cur, aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 957f6f78a..31384e1fa 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -210,6 +210,7 @@ void PASTEMAC(ch,varname)( \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ + inc_t is_a_cur; \ auxinfo_t aux; \ \ /* @@ -325,6 +326,9 @@ void PASTEMAC(ch,varname)( \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, aux ); \ @@ -382,14 +386,15 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_a_cur = k_a1011 * PACKMR; \ - ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \ - ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \ + is_a_cur = k_a1011 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ /* Compute the addresses of the panel A10 and the triangular block A11. */ \ a10 = a1; \ - a11 = a1 + ( k_a10 * PACKMR ) / off_scl; \ + /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \ + bli_ptr_add( a11, a1, k_a10 * PACKMR, off_scl ); \ \ /* Compute the addresses of the panel B01 and the block B11. */ \ @@ -414,7 +419,7 @@ void PASTEMAC(ch,varname)( \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( PACKMR * k_a1011, aux ); \ + bli_auxinfo_set_is_a( is_a_cur, aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ @@ -515,6 +520,44 @@ void PASTEMAC(ch,varname)( \ } \ \ /* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ +*/ \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ + ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ +*/ \ +\ +/* +if ( bli_is_4mi_packed( schema_a ) ){ \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \ + ( double* )b, rs_b, 1, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \ + ( double* )b+72, rs_b, 1, "%4.1f", "" ); \ +}else{ \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \ + ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \ + ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \ +} \ + +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \ + ( double* )c, 1, cs_c, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \ + ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \ +*/ \ +\ +/* PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 9270edda6..0e6129134 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -211,6 +211,7 @@ void PASTEMAC(ch,varname)( \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ + inc_t is_a_cur; \ auxinfo_t aux; \ \ /* @@ -333,6 +334,9 @@ void PASTEMAC(ch,varname)( \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, aux ); \ @@ -392,14 +396,15 @@ void PASTEMAC(ch,varname)( \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ - ps_a_cur = k_a1112 * PACKMR; \ - ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); \ - ps_a_cur = ( ps_a_cur * ss_a_num ) / ss_a_den; \ + is_a_cur = k_a1112 * PACKMR; \ + is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ + ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ /* Compute the addresses of the triangular block A11 and the panel A12. */ \ a11 = a1; \ - a12 = a1 + ( k_a11 * PACKMR ) / off_scl; \ + /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ + bli_ptr_add( a12, a1, k_a11 * PACKMR, off_scl ); \ \ /* Compute the addresses of the panel B01 and the block B11. */ \ @@ -424,7 +429,7 @@ void PASTEMAC(ch,varname)( \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( PACKMR * k_a1112, aux ); \ + bli_auxinfo_set_is_a( is_a_cur, aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index b90fd19d2..7a8e97490 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -211,6 +211,7 @@ void PASTEMAC(ch,varname)( \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ + inc_t is_b_cur; \ auxinfo_t aux; \ \ /* @@ -346,6 +347,9 @@ void PASTEMAC(ch,varname)( \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular @@ -399,18 +403,19 @@ void PASTEMAC(ch,varname)( \ /* Compute the addresses of the triangular block B11 and the panel B21. */ \ b11 = b1; \ - b21 = b1 + ( k_b11 * PACKNR ) / off_scl; \ + /* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \ + bli_ptr_add( b21, b1, k_b11 * PACKNR, off_scl ); \ \ /* Compute the panel stride for the current micro-panel. */ \ - ps_b_cur = k_b1121 * PACKNR; \ - ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \ - ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \ + is_b_cur = k_b1121 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_is_a( PACKNR * k_b1121, aux ); \ + bli_auxinfo_set_is_a( is_b_cur, aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 86187be20..bd66d654e 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -210,6 +210,7 @@ void PASTEMAC(ch,varname)( \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ + inc_t is_b_cur; \ auxinfo_t aux; \ \ /* @@ -341,6 +342,9 @@ void PASTEMAC(ch,varname)( \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ +\ + if ( bli_is_odd( istep_a ) ) istep_a += 1; \ + if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular @@ -392,18 +396,19 @@ void PASTEMAC(ch,varname)( \ /* Compute the addresses of the panel B10 and the triangular block B11. */ \ b01 = b1; \ - b11 = b1 + ( k_b01 * PACKNR ) / off_scl; \ + /* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \ + bli_ptr_add( b11, b1, k_b01 * PACKNR, off_scl ); \ \ /* Compute the panel stride for the current micro-panel. */ \ - ps_b_cur = k_b0111 * PACKNR; \ - ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); \ - ps_b_cur = ( ps_b_cur * ss_b_num ) / ss_b_den; \ + is_b_cur = k_b0111 * PACKNR; \ + is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ + ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_is_a( PACKNR * k_b0111, aux ); \ + bli_auxinfo_set_is_a( is_b_cur, aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.c b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.c index cdd6ad459..37f0aa3cd 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.c +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.c @@ -69,6 +69,13 @@ void PASTEMAC(ch,varname)( \ b11, \ c11, rs_c, cs_c, \ data ); \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_l_ukr: b0111p_r after", k+3, 8, \ + ( double* )b01, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_l_ukr: b0111p_i after", k+3, 8, \ + ( double* )b01 + 1, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ +*/ \ } INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ukr_ref, GEMM_UKERNEL, TRSM_L_UKERNEL ) diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index 907a07d83..95c858002 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -49,6 +49,17 @@ #endif +// -- Define typeof() operator if using non-GNU compiler -- + +#ifndef __GNUC__ + #define typeof __typeof__ +#else + #ifndef typeof + #define typeof __typeof__ + #endif +#endif + + // -- Boolean values -- #ifndef TRUE diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 1b1040b1f..c2e05ccac 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -653,6 +653,20 @@ bli_is_rpi_packed( schema ) ) +// pointer-related + +// p1 = p0 + (num/dem) +#define bli_ptr_add( p1, p0, num, dem ) \ +{ \ + p1 = ( typeof( p1 ) ) \ + ( ( char* )(p0) + ( ( (num) * sizeof( *(p0) ) \ + ) / (dem) \ + ) \ + ); \ +} + + + // return datatype for char diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsm4m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/bli_gemmtrsm4m1_l_ukr_ref.c index f73106485..d4d885c43 100644 --- a/frame/ind/ukernels/trsm/bli_gemmtrsm4m1_l_ukr_ref.c +++ b/frame/ind/ukernels/trsm/bli_gemmtrsm4m1_l_ukr_ref.c @@ -79,6 +79,17 @@ void PASTEMAC(ch,varname)( \ \ dim_t i, j; \ \ +/* +printf( "gemmtrsm4m1_l_ukr: is_a = %lu is_b = %lu\n", is_a, is_b ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1011p_r", m, k+m, \ + a10_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1011p_i", m, k+m, \ + a10_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r", k+m, n, \ + b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i", k+m, n, \ + b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +*/ \ \ /* Copy the contents of c to a temporary buffer ct. */ \ if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ @@ -99,7 +110,7 @@ void PASTEMAC(ch,varname)( \ \ \ /* b11.r = alpha.r * b11.r - ( a10.r * b01.r - a10.i * b01.i ); - b11.i = alpha.r * b11.r - ( a10.r * b01.i + a10.i * b01.r ); */ \ + b11.i = alpha.r * b11.i - ( a10.r * b01.i + a10.i * b01.r ); */ \ \ bli_auxinfo_set_next_ab( a10_r, b01_i, *data ); \ \ @@ -144,7 +155,12 @@ void PASTEMAC(ch,varname)( \ one_r, \ b11_r, rs_b, cs_b, \ data ); \ -\ +/* +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r post-gemm", k+m, n, \ + b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i post-gemm", k+m, n, \ + b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +*/ \ \ /* b11 = inv(a11) * b11; c11 = b11; */ \ @@ -152,6 +168,13 @@ void PASTEMAC(ch,varname)( \ b11_r, \ c11, rs_c, cs_c, \ data ); \ +\ +/* +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r after", k+m, n, \ + b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i after", k+m, n, \ + b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +*/ \ } INSERT_GENTFUNCCO_BASIC2( gemmtrsm4m1_l_ukr_ref, GEMM_UKERNEL, TRSM4M1_L_UKERNEL ) diff --git a/frame/ind/ukernels/trsm/bli_trsm4m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/bli_trsm4m1_l_ukr_ref.c index d03ef40a4..e11faabe6 100644 --- a/frame/ind/ukernels/trsm/bli_trsm4m1_l_ukr_ref.c +++ b/frame/ind/ukernels/trsm/bli_trsm4m1_l_ukr_ref.c @@ -66,6 +66,16 @@ void PASTEMAC(ch,varname)( \ dim_t iter, i, j, l; \ dim_t n_behind; \ \ +/* +PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: a11p_r", m, m, \ + a_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: a11p_i", m, m, \ + a_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_r", m, n, \ + b_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_i", m, n, \ + b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +*/ \ \ for ( iter = 0; iter < m; ++iter ) \ { \ @@ -136,6 +146,13 @@ void PASTEMAC(ch,varname)( \ PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ } \ } \ +\ +/* +PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_r after", m, n, \ + b_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_i after", m, n, \ + b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +*/ \ } INSERT_GENTFUNCCO_BASIC0( trsm4m1_l_ukr_ref )