Fixed a trsm1m bug that affected right-side cases.

Details:
- Fixed a bug introduced in 1c732d3 that affected trsm1m_r. The result
  was nondeterministic behavior (usually segmentation faults) for certain
  problem sizes beyond the 1m instance of kc (e.g. 128 on haswell). The
  cause of the bug was my commenting out lines in bli_gemm1m_ukr_ref.c
  which explicitly directed the virtual gemm micro-kernel to use temporary
  space if the storage preference of the [real domain] gemm ukernel did
  not match the storage of the output matrix C. In the context of gemm,
  this handling is not needed because agreement between the storage pref
  and the matrix is guaranteed by a high-level optimization in BLIS.
  However, this optimization is not applied to trsm because the storage
  of C is not necessarily the same as the storage of the micro-panels of
  B--both of which are updated by the micro-kernel during a trsm
  operation. Thus, the guarantee of storage/preference agreement is not
  in place for trsm, which means we must handle that case within the
  virtual gemm micro-kernel.
- Comment updates and a minor macro change to bli_trsm*_cntx_init() for
  3m1, 4m1a, and 1m.
This commit is contained in:
Field G. Van Zee
2017-05-02 15:48:30 -05:00
committed by prangana
parent 1af0b09f5c
commit 5ca3863220
2 changed files with 24 additions and 27 deletions

View File

@@ -73,10 +73,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx )
cntx
);
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI,
BLIS_PACKED_COL_PANELS_3MI,
cntx );
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
}
void bli_trsm3m1_cntx_finalize( cntx_t* cntx )
@@ -122,10 +121,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx )
cntx
);
// Set the pack_t schemas for native execution.
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI,
cntx );
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
}
void bli_trsm4m1_cntx_finalize( cntx_t* cntx )
@@ -174,9 +172,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1R,
cntx );
//bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E,
// BLIS_PACKED_COL_PANELS_1R,
// cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx );
}
else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
@@ -195,9 +195,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
);
// Set the pack_t schemas for the current induced method.
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R,
BLIS_PACKED_COL_PANELS_1E,
cntx );
//bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R,
// BLIS_PACKED_COL_PANELS_1E,
// cntx );
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx );
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx );
}
}

View File

@@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \
PASTECH(chr,gemm_ukr_ft) \
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
/*const bool_t row_pref = !col_pref;*/ \
const bool_t row_pref = !col_pref; \
\
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
@@ -94,24 +94,19 @@ void PASTEMAC(ch,varname) \
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
\
/* Sanity check: These should never occur because storage/preference
agreement is handled at a higher level. */ \
/*
if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \
*/ \
\
\
/* If beta has a non-zero imaginary component OR if c is stored with
general stride, then we compute the alpha*a*b product into temporary
storage and then accumulate that result into c afterwards. Note that
the other two cases concerning disagreement between the storage of C
and the output preference of the micro-kernel, should never occur
(though we could handle them if they did occur). */ \
and the output preference of the micro-kernel, should ONLY occur in
the context of trsm, whereby this virtual micro-kernel is called
directly from the trsm macro-kernel to update the micro-tile b11
that exists within the packed row-panel of B. Indeed that is the
reason those cases MUST be explicitly handled. */ \
if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \
/*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \
else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \
else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \
else using_ct = FALSE; \
\