mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Changed usage of virtual microkernel slots in cntx.
Details: - Changed the way virtual microkernels are handled in the context. Previously, there were query routines such as bli_cntx_get_l3_ukr_dt() which returned the native ukernel for a datatype if the method was equal to BLIS_NAT, or the virtual ukernel for that datatype if the method was some other value. Going forward, the context's native and virtual ukernel slots will both be initialized to native ukernel function pointers for native execution, and for non-native execution the virtual ukernel pointer will be something else. This allows us to always query the virtual ukernel slot (from within, say, the macrokernel) without needing any logic in the query routine to decide which function pointer (native or virtual) to return. (Essentially, the logic has been shifted to init-time instead of compute-time.) This scheme will also allow generalized virtual ukernels as a way to insert extra logic in between the macrokernel and the native microkernel. - Initialize native contexts (in bli_cntx_ref.c) with native ukernel function addresses stored to the virtual ukernel slots pursuant to the above policy change. - Renamed all static functions that were native/virtual-ambiguous, such as bli_cntx_get_l3_ukr_dt() or bli_cntx_l3_ukr_prefers_cols_dt(), pursuant to the above policy change. Those routines now use the substring "get_l3_vir_ukr" in their name instead of "get_l3_ukr". All of these functions were static functions defined in bli_cntx.h, and most uses were in level-3 front-ends and macrokernels. - Deprecated anti_pref bool_t in context, along with related functions such as bli_cntx_l3_ukr_eff_dislikes_storage_of(), now that 1m's panel-block execution is disabled.
This commit is contained in:
@@ -55,7 +55,7 @@ void PASTEMAC(ch,opname) \
|
||||
\
|
||||
/* Query the context for the function address of the current
|
||||
datatype's micro-kernel. */ \
|
||||
PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_ukr_dt( dt, kerid, cntx ); \
|
||||
PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
|
||||
\
|
||||
/* Invoke the typed function for the given datatype. */ \
|
||||
f( \
|
||||
@@ -91,7 +91,7 @@ void PASTEMAC(ch,opname) \
|
||||
\
|
||||
/* Query the context for the function address of the current
|
||||
datatype's micro-kernel. */ \
|
||||
PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_ukr_dt( dt, kerid, cntx ); \
|
||||
PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
|
||||
\
|
||||
/* Invoke the typed function for the given datatype. */ \
|
||||
f( \
|
||||
@@ -129,7 +129,7 @@ void PASTEMAC(ch,opname) \
|
||||
\
|
||||
/* Query the context for the function address of the current
|
||||
datatype's micro-kernel. */ \
|
||||
PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_ukr_dt( dt, kerid, cntx ); \
|
||||
PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
|
||||
\
|
||||
/* Invoke the typed function for the given datatype. */ \
|
||||
f( \
|
||||
|
||||
@@ -77,7 +77,7 @@ void bli_gemm_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_eff_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
|
||||
@@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -192,7 +192,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -163,13 +163,13 @@ void PASTEMAC(ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -163,13 +163,13 @@ void PASTEMAC(ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -72,7 +72,7 @@ void bli_hemm_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_toggle_side( &side );
|
||||
bli_obj_toggle_conj( &a_local );
|
||||
|
||||
@@ -92,7 +92,7 @@ void bli_her2k_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &bh_local );
|
||||
bli_obj_swap( &b_local, &ah_local );
|
||||
|
||||
@@ -77,7 +77,7 @@ void bli_herk_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_toggle_conj( &a_local );
|
||||
bli_obj_toggle_conj( &ah_local );
|
||||
|
||||
@@ -168,7 +168,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -177,7 +177,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -168,7 +168,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -177,7 +177,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -72,7 +72,7 @@ void bli_symm_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_toggle_side( &side );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
|
||||
@@ -81,7 +81,7 @@ void bli_syr2k_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ void bli_syrk_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
@@ -105,7 +105,7 @@ void bli_trmm_front
|
||||
// NOTE: We disable the optimization for 1x1 matrices since the concept
|
||||
// of row- vs. column storage breaks down.
|
||||
if ( !bli_obj_is_1x1( &c_local ) )
|
||||
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_toggle_side( &side );
|
||||
bli_obj_induce_trans( &a_local );
|
||||
|
||||
@@ -160,7 +160,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -169,7 +169,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -160,7 +160,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -169,7 +169,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -160,7 +160,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -169,7 +169,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -160,7 +160,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -169,7 +169,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -104,7 +104,7 @@ void bli_trmm3_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_toggle_side( &side );
|
||||
bli_obj_induce_trans( &a_local );
|
||||
|
||||
@@ -162,9 +162,9 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
PASTECH(ch,gemmtrsm_ukr_ft) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -173,7 +173,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -162,9 +162,9 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
PASTECH(ch,gemmtrsm_ukr_ft) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -173,7 +173,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -167,9 +167,9 @@ void PASTEMAC(ch,varname) \
|
||||
is transposed so that all kernel instances are of the "left"
|
||||
variety (since those are the only trsm ukernels that exist). */ \
|
||||
PASTECH(ch,gemmtrsm_ukr_ft) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -167,9 +167,9 @@ void PASTEMAC(ch,varname) \
|
||||
is transposed so that all kernel instances are of the "left"
|
||||
variety (since those are the only trsm ukernels that exist). */ \
|
||||
PASTECH(ch,gemmtrsm_ukr_ft) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -544,8 +544,10 @@ void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... )
|
||||
// -- End variable argument section --
|
||||
|
||||
// Query the context for the addresses of:
|
||||
// - the l3 virtual ukernel func_t array
|
||||
// - the l3 native ukernel func_t array
|
||||
// - the l3 native ukernel preferences array
|
||||
func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
func_t* cntx_l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx );
|
||||
mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
|
||||
|
||||
@@ -565,11 +567,18 @@ void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... )
|
||||
|
||||
// Index into the func_t and mbool_t for the current kernel id
|
||||
// being processed.
|
||||
func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ];
|
||||
func_t* ukrs = &cntx_l3_nat_ukrs[ ukr_id ];
|
||||
mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ];
|
||||
|
||||
// Store the ukernel function pointer and preference values into
|
||||
// the context.
|
||||
// the context. Notice that we redundantly store the native
|
||||
// ukernel address in both the native and virtual ukernel slots
|
||||
// in the context. This is standard practice when creating a
|
||||
// native context. (Induced method contexts will overwrite the
|
||||
// virtual function pointer with the address of the appropriate
|
||||
// virtual ukernel.)
|
||||
bli_func_set_dt( ukr_fp, ukr_dt, vukrs );
|
||||
bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
|
||||
bli_mbool_set_dt( ukr_pref, ukr_dt, prefs );
|
||||
}
|
||||
|
||||
@@ -60,8 +60,6 @@ typedef struct cntx_s
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
|
||||
bool_t anti_pref;
|
||||
|
||||
dim_t* thrloop;
|
||||
|
||||
membrk_t* membrk;
|
||||
@@ -126,10 +124,6 @@ static pack_t bli_cntx_schema_c_panel( cntx_t* cntx )
|
||||
{
|
||||
return cntx->schema_c_panel;
|
||||
}
|
||||
static bool_t bli_cntx_anti_pref( cntx_t* cntx )
|
||||
{
|
||||
return cntx->anti_pref;
|
||||
}
|
||||
static dim_t* bli_cntx_thrloop( cntx_t* cntx )
|
||||
{
|
||||
return cntx->thrloop;
|
||||
@@ -166,10 +160,6 @@ static void bli_cntx_set_schema_ab_blockpanel( pack_t sa, pack_t sb, cntx_t* cnt
|
||||
bli_cntx_set_schema_a_block( sa, cntx );
|
||||
bli_cntx_set_schema_b_panel( sb, cntx );
|
||||
}
|
||||
static void bli_cntx_set_anti_pref( bool_t anti_pref, cntx_t* cntx )
|
||||
{
|
||||
cntx->anti_pref = anti_pref;
|
||||
}
|
||||
static void bli_cntx_set_membrk( membrk_t* membrk, cntx_t* cntx )
|
||||
{
|
||||
cntx->membrk = membrk;
|
||||
@@ -234,27 +224,6 @@ static dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
static func_t* bli_cntx_get_l3_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
func_t* funcs;
|
||||
|
||||
if ( bli_cntx_method( (cntx) ) != BLIS_NAT )
|
||||
funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
else
|
||||
funcs = bli_cntx_l3_nat_ukrs_buf( cntx );
|
||||
|
||||
func_t* func = &funcs[ ukr_id ];
|
||||
|
||||
return func;
|
||||
}
|
||||
|
||||
static void* bli_cntx_get_l3_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
func_t* func = bli_cntx_get_l3_ukrs( ukr_id, cntx );
|
||||
|
||||
return bli_func_get_dt( dt, func );
|
||||
}
|
||||
|
||||
static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
@@ -487,55 +456,43 @@ static bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_i
|
||||
return !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
}
|
||||
|
||||
static bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
static bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
static bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
static bool_t bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
// For induced methods, return the ukernel storage preferences of the
|
||||
// corresponding real micro-kernel.
|
||||
// NOTE: This projection to real domain becomes unnecessary if you
|
||||
// set the exec_dt for 1m to the real projection of the storage
|
||||
// datatype.
|
||||
if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
dt = bli_dt_proj_to_real( dt );
|
||||
|
||||
return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
|
||||
}
|
||||
|
||||
static bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
static bool_t bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
// For induced methods, return the ukernel storage preferences of the
|
||||
// corresponding real micro-kernel.
|
||||
// NOTE: This projection to real domain becomes unnecessary if you
|
||||
// set the exec_dt for 1m to the real projection of the storage
|
||||
// datatype.
|
||||
if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
dt = bli_dt_proj_to_real( dt );
|
||||
|
||||
return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx );
|
||||
}
|
||||
|
||||
static bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
static bool_t bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
const num_t dt = bli_obj_dt( obj );
|
||||
// Note that we use the execution datatype, which may differ from the
|
||||
// storage datatype of C (though this would happen in very few situations).
|
||||
const num_t dt = bli_obj_exec_dt( obj );
|
||||
const bool_t ukr_prefers_rows
|
||||
= bli_cntx_l3_ukr_prefers_rows_dt( dt, ukr_id, cntx );
|
||||
= bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx );
|
||||
const bool_t ukr_prefers_cols
|
||||
= bli_cntx_l3_ukr_prefers_cols_dt( dt, ukr_id, cntx );
|
||||
= bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx );
|
||||
bool_t r_val = FALSE;
|
||||
|
||||
if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
|
||||
@@ -544,29 +501,9 @@ static bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cn
|
||||
return r_val;
|
||||
}
|
||||
|
||||
static bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
static bool_t bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
return !bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
}
|
||||
|
||||
static bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
static bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
// If the anti-preference is set, negate the result.
|
||||
if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val;
|
||||
|
||||
return r_val;
|
||||
return !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -584,7 +584,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt )
|
||||
// then query the ukernel function pointer for the given datatype from
|
||||
// that context.
|
||||
cntx_t* cntx = bli_gks_query_ind_cntx( method, dt );
|
||||
void* fp = bli_cntx_get_l3_ukr_dt( dt, ukr, cntx );
|
||||
void* fp = bli_cntx_get_l3_vir_ukr_dt( dt, ukr, cntx );
|
||||
|
||||
// Check whether the ukernel function pointer is NULL for the given
|
||||
// datatype. If it is NULL, return the string for not applicable.
|
||||
|
||||
@@ -1128,8 +1128,6 @@ typedef struct cntx_s
|
||||
pack_t schema_b_panel;
|
||||
pack_t schema_c_panel;
|
||||
|
||||
bool_t anti_pref;
|
||||
|
||||
dim_t thrloop[ BLIS_NUM_LOOPS ];
|
||||
|
||||
membrk_t* membrk;
|
||||
|
||||
@@ -60,9 +60,9 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
ctype* minus_one = PASTEMAC(ch,m1); \
|
||||
\
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
PASTECH(ch,trsm_ukr_ft) \
|
||||
trsm_ukr = bli_cntx_get_l3_ukr_dt( dt, trsmkerid, cntx ); \
|
||||
trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \
|
||||
\
|
||||
/* lower: b11 = alpha * b11 - a10 * b01; */ \
|
||||
/* upper: b11 = alpha * b11 - a12 * b21; */ \
|
||||
|
||||
@@ -363,11 +363,11 @@ void GENBARNAME(cntx_init)
|
||||
|
||||
funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
|
||||
gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm1m_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm1m_l_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm1m_u_ukr_name );
|
||||
gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name );
|
||||
gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
|
||||
gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
|
||||
gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name );
|
||||
gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name );
|
||||
|
||||
|
||||
// -- Set level-3 native micro-kernels and preferences ---------------------
|
||||
@@ -467,7 +467,7 @@ void GENBARNAME(cntx_init)
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
|
||||
bli_cntx_set_schema_c_panel( BLIS_NOT_PACKED, cntx );
|
||||
|
||||
bli_cntx_set_anti_pref( FALSE, cntx );
|
||||
//bli_cntx_set_anti_pref( FALSE, cntx );
|
||||
|
||||
bli_cntx_set_thrloop( 1, 1, 1, 1, 1, cntx );
|
||||
|
||||
@@ -726,7 +726,7 @@ void GENBAINAME(cntx_init)
|
||||
|
||||
// Initialize the blocksizes according to the micro-kernel preference as
|
||||
// well as the algorithm.
|
||||
if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithms 1m_c_bp, 1m_r_pb.
|
||||
|
||||
@@ -754,7 +754,7 @@ void GENBAINAME(cntx_init)
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithms 1m_r_bp, 1m_c_pb.
|
||||
|
||||
@@ -811,7 +811,7 @@ void GENBAINAME(cntx_init)
|
||||
}
|
||||
else if ( method == BLIS_1M )
|
||||
{
|
||||
const bool_t is_pb = FALSE;
|
||||
//const bool_t is_pb = FALSE;
|
||||
|
||||
// Set the anti-preference field to TRUE when executing a panel-block
|
||||
// algorithm, and FALSE otherwise. This will cause higher-level generic
|
||||
@@ -819,7 +819,7 @@ void GENBAINAME(cntx_init)
|
||||
// the micro-kernel output preference so that the two will come back into
|
||||
// agreement in the panel-block macro-kernel (which is implemented in terms
|
||||
// of the block-panel macro-kernel with some induced transpositions).
|
||||
bli_cntx_set_anti_pref( is_pb, cntx );
|
||||
//bli_cntx_set_anti_pref( is_pb, cntx );
|
||||
}
|
||||
else // if ( method == BLIS_NAT )
|
||||
{
|
||||
|
||||
@@ -54,7 +54,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
\
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t row_pref = !col_pref; \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
|
||||
@@ -59,7 +59,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
PASTECH(ch,trsm_ukr_ft) \
|
||||
ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \
|
||||
\
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
|
||||
@@ -87,7 +87,7 @@ void blx_gemm_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_eff_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
|
||||
@@ -171,7 +171,7 @@ void PASTECH2(blx_,ch,varname) \
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
@@ -180,7 +180,7 @@ void PASTECH2(blx_,ch,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
Reference in New Issue
Block a user