mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Fixed a trsm1m bug that affected right-side cases.
Details:
- Fixed a bug introduced in 1c732d3 that affected trsm1m_r. The result
was nondeterministic behavior (usually segmentation faults) for certain
problem sizes beyond the 1m instance of kc (e.g. 128 on haswell). The
cause of the bug was my commenting out lines in bli_gemm1m_ukr_ref.c
which explicitly directed the virtual gemm micro-kernel to use temporary
space if the storage preference of the [real domain] gemm ukernel did
not match the storage of the output matrix C. In the context of gemm,
this handling is not needed because agreement between the storage pref
and the matrix is guaranteed by a high-level optimization in BLIS.
However, this optimization is not applied to trsm because the storage
of C is not necessarily the same as the storage of the micro-panels of
B--both of which are updated by the micro-kernel during a trsm
operation. Thus, the guarantee of storage/preference agreement is not
in place for trsm, which means we must handle that case within the
virtual gemm micro-kernel.
- Comment updates and a minor macro change to bli_trsm*_cntx_init() for
3m1, 4m1a, and 1m.
This commit is contained in:
committed by
prangana
parent
1af0b09f5c
commit
5ca3863220
@@ -73,10 +73,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for native execution.
|
||||
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI,
|
||||
BLIS_PACKED_COL_PANELS_3MI,
|
||||
cntx );
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
|
||||
}
|
||||
|
||||
void bli_trsm3m1_cntx_finalize( cntx_t* cntx )
|
||||
@@ -122,10 +121,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for native execution.
|
||||
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI,
|
||||
BLIS_PACKED_COL_PANELS_4MI,
|
||||
cntx );
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
|
||||
}
|
||||
|
||||
void bli_trsm4m1_cntx_finalize( cntx_t* cntx )
|
||||
@@ -174,9 +172,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E,
|
||||
BLIS_PACKED_COL_PANELS_1R,
|
||||
cntx );
|
||||
//bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E,
|
||||
// BLIS_PACKED_COL_PANELS_1R,
|
||||
// cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx );
|
||||
}
|
||||
else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
@@ -195,9 +195,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Set the pack_t schemas for the current induced method.
|
||||
bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R,
|
||||
BLIS_PACKED_COL_PANELS_1E,
|
||||
cntx );
|
||||
//bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R,
|
||||
// BLIS_PACKED_COL_PANELS_1E,
|
||||
// cntx );
|
||||
bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx );
|
||||
bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
/*const bool_t row_pref = !col_pref;*/ \
|
||||
const bool_t row_pref = !col_pref; \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
@@ -94,24 +94,19 @@ void PASTEMAC(ch,varname) \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* Sanity check: These should never occur because storage/preference
|
||||
agreement is handled at a higher level. */ \
|
||||
/*
|
||||
if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \
|
||||
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/* If beta has a non-zero imaginary component OR if c is stored with
|
||||
general stride, then we compute the alpha*a*b product into temporary
|
||||
storage and then accumulate that result into c afterwards. Note that
|
||||
the other two cases concerning disagreement between the storage of C
|
||||
and the output preference of the micro-kernel, should never occur
|
||||
(though we could handle them if they did occur). */ \
|
||||
and the output preference of the micro-kernel, should ONLY occur in
|
||||
the context of trsm, whereby this virtual micro-kernel is called
|
||||
directly from the trsm macro-kernel to update the micro-tile b11
|
||||
that exists within the packed row-panel of B. Indeed that is the
|
||||
reason those cases MUST be explicitly handled. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \
|
||||
/*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
|
||||
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \
|
||||
else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
|
||||
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \
|
||||
else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \
|
||||
else using_ct = FALSE; \
|
||||
\
|
||||
|
||||
Reference in New Issue
Block a user