From 5ca3863220e07972fcefc6682ddd3f6e54fe4a94 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 15:48:30 -0500 Subject: [PATCH] Fixed a trsm1m bug that affected right-side cases. Details: - Fixed a bug introduced in 1c732d3 that affected trsm1m_r. The result was nondeterministic behavior (usually segmentation faults) for certain problem sizes beyond the 1m instance of kc (e.g. 128 on haswell). The cause of the bug was my commenting out lines in bli_gemm1m_ukr_ref.c which explicitly directed the virtual gemm micro-kernel to use temporary space if the storage preference of the [real domain] gemm ukernel did not match the storage of the output matrix C. In the context of gemm, this handling is not needed because agreement between the storage pref and the matrix is guaranteed by a high-level optimization in BLIS. However, this optimization is not applied to trsm because the storage of C is not necessarily the same as the storage of the micro-panels of B--both of which are updated by the micro-kernel during a trsm operation. Thus, the guarantee of storage/preference agreement is not in place for trsm, which means we must handle that case within the virtual gemm micro-kernel. - Comment updates and a minor macro change to bli_trsm*_cntx_init() for 3m1, 4m1a, and 1m. --- frame/ind/cntx/bli_trsmind_cntx.c | 30 +++++++++++--------- frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c | 21 ++++++-------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index a13d0d05a..96f9add60 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -73,10 +73,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) cntx ); - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) @@ -122,10 +121,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) cntx ); - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) @@ -174,9 +172,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1R, - cntx ); + //bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, + // BLIS_PACKED_COL_PANELS_1R, + // cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx ); } else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { @@ -195,9 +195,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1E, - cntx ); + //bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, + // BLIS_PACKED_COL_PANELS_1E, + // cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx ); } } diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c index ff23a36f4..6279ab762 100644 --- a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - /*const bool_t row_pref = !col_pref;*/ \ + const bool_t row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -94,24 +94,19 @@ void PASTEMAC(ch,varname) \ if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ -\ - /* Sanity check: These should never occur because storage/preference - agreement is handled at a higher level. */ \ - /* - if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ - else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ - */ \ -\ \ /* If beta has a non-zero imaginary component OR if c is stored with general stride, then we compute the alpha*a*b product into temporary storage and then accumulate that result into c afterwards. Note that the other two cases concerning disagreement between the storage of C - and the output preference of the micro-kernel, should never occur - (though we could handle them if they did occur). */ \ + and the output preference of the micro-kernel, should ONLY occur in + the context of trsm, whereby this virtual micro-kernel is called + directly from the trsm macro-kernel to update the micro-tile b11 + that exists within the packed row-panel of B. Indeed that is the + reason those cases MUST be explicitly handled. */ \ if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ - /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ - else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ + else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \