mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Added 1m-specific APIs for bp, pb gemm algorithms.
Details:
- Defined bli_gemmbp_cntl_create(), bli_gemmpb_cntl_create(), with the
body of bli_gemm_cntl_create() replaced with a call to the former.
- Defined bli_cntl_free_w_thrinfo(), bli_cntl_free_wo_thrinfo(). Now,
bli_cntl_free() can check if the thread parameter is NULL, and if so,
call the latter, and otherwise call the former.
- Defined bli_gemm1mbp_cntx_init(), bli_gemm1mpb_cntx_init(), both in
terms of bli_gemm1mxx_cntx_init(), which behaves the same as
bli_gemm1m_cntx_init() did before, except that an extra bool parameter
(is_pb) is used to support both bp and pb algorithms (including to
support the anti-preference field described below).
- Added support for "anti-preference" in context. The anti_pref field,
when true, will toggle the boolean return value of routines such as
bli_cntx_l3_ukr_eff_prefers_storage_of(), which has the net effect of
causing BLIS to transpose the operation to achieve disagreement (rather
than agreement) between the storage of C and the micro-kernel output
preference. This disagreement is needed for panel-block implementations,
since they induce a transposition of the suboperation immediately before
the macro-kernel is called, which changes the apparent storage of C. For
now, anti-preference is used only with the pb algorithm for 1m (and not
with any non-1m implementation).
- Defined new functions,
bli_cntx_l3_ukr_eff_prefers_storage_of()
bli_cntx_l3_ukr_eff_dislikes_storage_of()
bli_cntx_l3_nat_ukr_eff_prefers_storage_of()
bli_cntx_l3_nat_ukr_eff_dislikes_storage_of()
which are identical to their non-"eff" (effectively) counterparts except
that they take the anti-preference field of the context into account.
- Explicitly initialize the anti-pref field to FALSE in
bli_gks_cntx_set_l3_nat_ukr_prefs().
- Added bli_gemm_ker_var1.c, which implements a panel-block macro-kernel
in terms of the existing block-panel macro-kernel _ker_var2(). This
technique requires inducing transposes on all operands and swapping
A and B.
- Changed bli_obj_induce_trans() macro so that pack-related fields are
also changed to reflect the induced transposition.
- Added a temporary hack to bli_l3_3m4m1m_oapi.c that allows us to easily
specify the 1m algorithm (block-panel or panel-block).
- Renamed the following cntx_t-related macros:
bli_cntx_get_pack_schema_a() -> bli_cntx_get_pack_schema_a_block()
bli_cntx_get_pack_schema_b() -> bli_cntx_get_pack_schema_b_panel()
bli_cntx_get_pack_schema_c() -> bli_cntx_get_pack_schema_c_panel()
and updated all instantiations. Also updated the field names in the
cntx_t struct.
- Comment updates.
This commit is contained in:
committed by
prangana
parent
1d728ccb23
commit
4f61528d56
@@ -877,6 +877,12 @@ bli_obj_width_stored( obj )
|
||||
(obj).n_panel = n0; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_panel_dims( m0, n0, obj ) \
|
||||
{ \
|
||||
bli_obj_set_panel_length( m0, obj ); \
|
||||
bli_obj_set_panel_width( n0, obj ); \
|
||||
}
|
||||
|
||||
#define bli_obj_set_panel_dim( panel_dim, obj ) \
|
||||
{ \
|
||||
(obj).pd = panel_dim; \
|
||||
@@ -985,6 +991,7 @@ bli_obj_width_stored( obj )
|
||||
#define bli_obj_induce_trans( obj ) \
|
||||
{ \
|
||||
{ \
|
||||
/* Induce transposition among basic fields. */ \
|
||||
dim_t m_ = bli_obj_length( obj ); \
|
||||
dim_t n_ = bli_obj_width( obj ); \
|
||||
inc_t rs_ = bli_obj_row_stride( obj ); \
|
||||
@@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj )
|
||||
\
|
||||
if ( bli_obj_is_upper_or_lower( obj ) ) \
|
||||
bli_obj_toggle_uplo( obj ); \
|
||||
\
|
||||
/* Induce transposition among packed fields. */ \
|
||||
dim_t m_padded_ = bli_obj_padded_length( obj ); \
|
||||
dim_t n_padded_ = bli_obj_padded_width( obj ); \
|
||||
dim_t m_panel_ = bli_obj_panel_length( obj ); \
|
||||
dim_t n_panel_ = bli_obj_panel_width( obj ); \
|
||||
\
|
||||
bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \
|
||||
bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \
|
||||
\
|
||||
/* Note that this macro DOES NOT touch the transposition bit! If
|
||||
the calling code is using this macro to handle an object whose
|
||||
|
||||
@@ -975,9 +975,11 @@ typedef struct cntx_s
|
||||
|
||||
opid_t family;
|
||||
ind_t method;
|
||||
pack_t schema_a;
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
pack_t schema_a_block;
|
||||
pack_t schema_b_panel;
|
||||
pack_t schema_c_panel;
|
||||
|
||||
bool_t anti_pref;
|
||||
|
||||
dim_t thrloop[ BLIS_NUM_LOOPS ];
|
||||
|
||||
|
||||
Reference in New Issue
Block a user