mirror of
https://github.com/amd/blis.git
synced 2026-05-21 08:58:30 +00:00
Simplify and rewrite reference packm kernels. (#610)
Details:
- Reorganized the way kernels are stored within the cntx_t structure so
that rather than having a function pointer for every supported size of
unrolled packm kernel (2xk, 3xk, 4xk, etc.), we store only two packm
kernels per datatype: one to pack MRxk micropanels and one to pack
NRxk micropanels.
- NOTE: The "bb" (broadcast B) reference kernels have been merged into
the "standard" kernels (packm [including 1er and unpackm], gemm,
trsm, gemmtrsm). The broadcast replication factor is controlled by
BLIS_BB[MN]_[sdcz] etc. Power9/10 needs testing since only a
replication factor of 1 has been tested. armsve also needs testing
since the MR value isn't available as a macro.
- Simplified the bli_cntx_*() APIs to conform to the new unified kernel
array within the cntx_t. Updated existing bli_cntx_init_<subconfig>()
function definitions for all subconfigurations.
- Consolidated all kernel id types (e.g. l1vkr_t, l1mkr_t, l3ukr_t,
etc.) into one kernel id type: ukr_t.
- Various edits, updates, and rewrites of reference kernels pursuant to
the aforementioned changes.
- Defined compile-time macro constants (BLIS_MR_[sdcz], BLIS_NR_[sdcz],
and friends) in bli_kernel_macro_defs.h, but only when the macro
BLIS_IN_REF_KERNEL is defined by the build system.
- Loose ends:
- Still need to update documentation, including:
- docs/ConfigurationHowTo.md
- docs/KernelsHowTo.md
to reflect changes made in this commit.
This commit is contained in:
@@ -164,7 +164,7 @@ void PASTECH2(bao_,ch,varname) \
|
|||||||
function pointer type. */ \
|
function pointer type. */ \
|
||||||
/*
|
/*
|
||||||
PASTECH(ch,gemm_ukr_ft) \
|
PASTECH(ch,gemm_ukr_ft) \
|
||||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||||
*/ \
|
*/ \
|
||||||
\
|
\
|
||||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||||
@@ -175,7 +175,7 @@ void PASTECH2(bao_,ch,varname) \
|
|||||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||||
/ sizeof( ctype ) ] \
|
/ sizeof( ctype ) ] \
|
||||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||||
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||||
*/ \
|
*/ \
|
||||||
@@ -536,7 +536,7 @@ void PASTECH2(bao_,ch,varname) \
|
|||||||
/* Query the context for the microkernel address and cast it to its
|
/* Query the context for the microkernel address and cast it to its
|
||||||
function pointer type. */ \
|
function pointer type. */ \
|
||||||
PASTECH(ch,gemm_ukr_ft) \
|
PASTECH(ch,gemm_ukr_ft) \
|
||||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||||
\
|
\
|
||||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||||
temporary buffer are set so that they match the storage of the
|
temporary buffer are set so that they match the storage of the
|
||||||
@@ -545,7 +545,7 @@ void PASTECH2(bao_,ch,varname) \
|
|||||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||||
/ sizeof( ctype ) ] \
|
/ sizeof( ctype ) ] \
|
||||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||||
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||||
\
|
\
|
||||||
|
|||||||
@@ -137,7 +137,7 @@ void bao_gemmd_ex
|
|||||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||||
// prefers contiguous rows, transpose the entire operation to allow the
|
// prefers contiguous rows, transpose the entire operation to allow the
|
||||||
// micro-kernel to access elements of C in its preferred manner.
|
// micro-kernel to access elements of C in its preferred manner.
|
||||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) )
|
||||||
{
|
{
|
||||||
bli_obj_swap( &a_local, &b_local );
|
bli_obj_swap( &a_local, &b_local );
|
||||||
|
|
||||||
|
|||||||
@@ -163,7 +163,7 @@ void PASTECH2(bao_,ch,varname) \
|
|||||||
/* Query the context for the microkernel address and cast it to its
|
/* Query the context for the microkernel address and cast it to its
|
||||||
function pointer type. */ \
|
function pointer type. */ \
|
||||||
PASTECH(ch,gemm_ukr_ft) \
|
PASTECH(ch,gemm_ukr_ft) \
|
||||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||||
\
|
\
|
||||||
/* Compute partitioning step values for each matrix of each loop. */ \
|
/* Compute partitioning step values for each matrix of each loop. */ \
|
||||||
const inc_t jcstep_c = cs_c; \
|
const inc_t jcstep_c = cs_c; \
|
||||||
|
|||||||
@@ -55,15 +55,15 @@ void PASTECH2(bao_,ch,opname) \
|
|||||||
/* Note that we use panel_dim_max, not panel_dim, to query the packm
|
/* Note that we use panel_dim_max, not panel_dim, to query the packm
|
||||||
kernel function pointer. This means that we always use the same
|
kernel function pointer. This means that we always use the same
|
||||||
kernel, even for edge cases. */ \
|
kernel, even for edge cases. */ \
|
||||||
num_t dt = PASTEMAC(ch,type); \
|
num_t dt = PASTEMAC(ch,type); \
|
||||||
l1mkr_t ker_id = panel_dim_max; \
|
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f; \
|
PASTECH2(ch,opname,_ker_ft) f; \
|
||||||
\
|
\
|
||||||
/* Query the context for the packm kernel corresponding to the current
|
/* Query the context for the packm kernel corresponding to the current
|
||||||
panel dimension, or kernel id. If the id is invalid, the function will
|
panel dimension, or kernel id. If the id is invalid, the function will
|
||||||
return NULL. */ \
|
return NULL. */ \
|
||||||
f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
|
f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
|
||||||
\
|
\
|
||||||
/* If there exists a kernel implementation for the micro-panel dimension
|
/* If there exists a kernel implementation for the micro-panel dimension
|
||||||
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
||||||
|
|||||||
@@ -120,6 +120,8 @@ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
|||||||
-DBLIS_CNAME=$(1) \
|
-DBLIS_CNAME=$(1) \
|
||||||
$(BUILD_CPPFLAGS) \
|
$(BUILD_CPPFLAGS) \
|
||||||
$(BUILD_SYMFLAGS) \
|
$(BUILD_SYMFLAGS) \
|
||||||
|
-DBLIS_IN_REF_KERNEL=1 \
|
||||||
|
-include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \
|
||||||
)
|
)
|
||||||
|
|
||||||
get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
|
get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
|
||||||
@@ -129,6 +131,8 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
|
|||||||
-DBLIS_CNAME=$(1) \
|
-DBLIS_CNAME=$(1) \
|
||||||
$(BUILD_CPPFLAGS) \
|
$(BUILD_CPPFLAGS) \
|
||||||
$(BUILD_SYMFLAGS) \
|
$(BUILD_SYMFLAGS) \
|
||||||
|
-DBLIS_IN_REF_KERNEL=1 \
|
||||||
|
-include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \
|
||||||
)
|
)
|
||||||
|
|
||||||
get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||||
|
|||||||
@@ -38,34 +38,42 @@
|
|||||||
void bli_cntx_init_a64fx( cntx_t* cntx )
|
void bli_cntx_init_a64fx( cntx_t* cntx )
|
||||||
{
|
{
|
||||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
|
||||||
|
|
||||||
// Set default kernel blocksizes and functions.
|
// Set default kernel blocksizes and functions.
|
||||||
bli_cntx_init_a64fx_ref( cntx );
|
bli_cntx_init_a64fx_ref( cntx );
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
4,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
|
// level-3
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed,
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed,
|
||||||
|
|
||||||
|
// packm
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Set SVE-512 packing routine.
|
// Update the context with storage preferences.
|
||||||
bli_cntx_set_packm_kers
|
bli_cntx_set_ukr_prefs
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
|
|
||||||
// 12xk is not used and disabled for GCC 8-9 compatibility.
|
// level-3
|
||||||
// BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk,
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
cntx
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -80,66 +88,18 @@ void bli_cntx_init_a64fx( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
#if 0
|
|
||||||
// Initialize sup thresholds with architecture-appropriate values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 );
|
|
||||||
|
|
||||||
// Initialize the context with the sup thresholds.
|
|
||||||
bli_cntx_set_l3_sup_thresh
|
|
||||||
(
|
|
||||||
3,
|
|
||||||
BLIS_MT, &thresh[ BLIS_MT ],
|
|
||||||
BLIS_NT, &thresh[ BLIS_NT ],
|
|
||||||
BLIS_KT, &thresh[ BLIS_KT ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
|
||||||
bli_cntx_set_l3_sup_kers
|
|
||||||
(
|
|
||||||
4,
|
|
||||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
|
||||||
// values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
|
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
|
||||||
// blocksizes for small/unpacked level-3 problems.
|
|
||||||
bli_cntx_set_l3_sup_blkszs
|
|
||||||
(
|
|
||||||
5,
|
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Set A64FX cache sector sizes for each PE/CMG
|
// Set A64FX cache sector sizes for each PE/CMG
|
||||||
// SC Fugaku might disable users' setting cache sizes.
|
// SC Fugaku might disable users' setting cache sizes.
|
||||||
#if !defined(CACHE_SECTOR_SIZE_READONLY)
|
#if !defined(CACHE_SECTOR_SIZE_READONLY)
|
||||||
|
|||||||
52
config/a64fx/bli_kernel_defs_a64fx.h
Normal file
52
config/a64fx/bli_kernel_defs_a64fx.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 32
|
||||||
|
#define BLIS_MR_d 16
|
||||||
|
#define BLIS_MR_c 16
|
||||||
|
#define BLIS_MR_z 8
|
||||||
|
|
||||||
|
#define BLIS_NR_s 10
|
||||||
|
#define BLIS_NR_d 10
|
||||||
|
#define BLIS_NR_c 10
|
||||||
|
#define BLIS_NR_z 10
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -45,9 +45,6 @@ void bli_cntx_init_armsve( cntx_t* cntx )
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||||
#if 0
|
|
||||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Set default kernel blocksizes and functions.
|
// Set default kernel blocksizes and functions.
|
||||||
bli_cntx_init_armsve_ref( cntx );
|
bli_cntx_init_armsve_ref( cntx );
|
||||||
@@ -64,35 +61,55 @@ void bli_cntx_init_armsve( cntx_t* cntx )
|
|||||||
bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c);
|
bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c);
|
||||||
bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z);
|
bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z);
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
4,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
// These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
|
// These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed,
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed,
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Set VL-specific packing routines if applicable.
|
// Set VL-specific packing routines if applicable.
|
||||||
if (m_r_d==16)
|
if ( m_r_d == 16 )
|
||||||
bli_cntx_set_packm_kers
|
{
|
||||||
|
bli_cntx_set_ukrs
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
|
||||||
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
|
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
|
||||||
cntx
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
else if (m_r_d==8)
|
}
|
||||||
bli_cntx_set_packm_kers
|
else if ( m_r_d == 8 )
|
||||||
|
{
|
||||||
|
bli_cntx_set_ukrs
|
||||||
(
|
(
|
||||||
1,
|
cntx,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk,
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk,
|
||||||
cntx
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
// s d c z
|
// s d c z
|
||||||
@@ -106,64 +123,16 @@ void bli_cntx_init_armsve( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
#if 0
|
|
||||||
// Initialize sup thresholds with architecture-appropriate values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 );
|
|
||||||
|
|
||||||
// Initialize the context with the sup thresholds.
|
|
||||||
bli_cntx_set_l3_sup_thresh
|
|
||||||
(
|
|
||||||
3,
|
|
||||||
BLIS_MT, &thresh[ BLIS_MT ],
|
|
||||||
BLIS_NT, &thresh[ BLIS_NT ],
|
|
||||||
BLIS_KT, &thresh[ BLIS_KT ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
|
||||||
bli_cntx_set_l3_sup_kers
|
|
||||||
(
|
|
||||||
4,
|
|
||||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
|
||||||
// values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 );
|
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
|
||||||
// blocksizes for small/unpacked level-3 problems.
|
|
||||||
bli_cntx_set_l3_sup_blkszs
|
|
||||||
(
|
|
||||||
5,
|
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
58
config/armsve/bli_kernel_defs_armsve.h
Normal file
58
config/armsve/bli_kernel_defs_armsve.h
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
//
|
||||||
|
// The armsve configuration handles both 256-bit and 512-bit SVE vectors,
|
||||||
|
// so it is not possible to define specific register block sizes. Thus,
|
||||||
|
// armsve can't use reference kernels!
|
||||||
|
//
|
||||||
|
|
||||||
|
#define BLIS_MR_s -1
|
||||||
|
#define BLIS_MR_d -1
|
||||||
|
#define BLIS_MR_c -1
|
||||||
|
#define BLIS_MR_z -1
|
||||||
|
|
||||||
|
#define BLIS_NR_s 10
|
||||||
|
#define BLIS_NR_d 10
|
||||||
|
#define BLIS_NR_c 10
|
||||||
|
#define BLIS_NR_z 10
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,14 +43,28 @@ void bli_cntx_init_bgq( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, FALSE,
|
// level-3
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -65,13 +79,16 @@ void bli_cntx_init_bgq( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/bgq/bli_kernel_defs_bgq.h
Normal file
48
config/bgq/bli_kernel_defs_bgq.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_d 8
|
||||||
|
#define BLIS_MR_z 4
|
||||||
|
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
#define BLIS_NR_z 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,16 +43,32 @@ void bli_cntx_init_bulldozer( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
4,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, FALSE,
|
// level-3
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, FALSE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4,
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -67,13 +83,16 @@ void bli_cntx_init_bulldozer( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
config/bulldozer/bli_kernel_defs_bulldozer.h
Normal file
52
config/bulldozer/bli_kernel_defs_bulldozer.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 8
|
||||||
|
#define BLIS_MR_d 4
|
||||||
|
#define BLIS_MR_c 8
|
||||||
|
#define BLIS_MR_z 4
|
||||||
|
|
||||||
|
#define BLIS_NR_s 8
|
||||||
|
#define BLIS_NR_d 6
|
||||||
|
#define BLIS_NR_c 4
|
||||||
|
#define BLIS_NR_z 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa15( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE,
|
// level-3
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -73,13 +87,16 @@ void bli_cntx_init_cortexa15( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/cortexa15/bli_kernel_defs_cortexa15.h
Normal file
48
config/cortexa15/bli_kernel_defs_cortexa15.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 4
|
||||||
|
#define BLIS_MR_d 4
|
||||||
|
|
||||||
|
#define BLIS_NR_s 4
|
||||||
|
#define BLIS_NR_d 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa53( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
|
// level-3
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa53( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/cortexa53/bli_kernel_defs_cortexa53.h
Normal file
48
config/cortexa53/bli_kernel_defs_cortexa53.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 8
|
||||||
|
#define BLIS_MR_d 6
|
||||||
|
|
||||||
|
#define BLIS_NR_s 12
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa57( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
|
// level-3
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa57( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/cortexa57/bli_kernel_defs_cortexa57.h
Normal file
48
config/cortexa57/bli_kernel_defs_cortexa57.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 8
|
||||||
|
#define BLIS_MR_d 6
|
||||||
|
|
||||||
|
#define BLIS_NR_s 12
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,14 +43,28 @@ void bli_cntx_init_cortexa9( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE,
|
// level-3
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa9( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/cortexa9/bli_kernel_defs_cortexa9.h
Normal file
48
config/cortexa9/bli_kernel_defs_cortexa9.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 4
|
||||||
|
#define BLIS_MR_d 4
|
||||||
|
|
||||||
|
#define BLIS_NR_s 4
|
||||||
|
#define BLIS_NR_d 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,16 +43,32 @@ void bli_cntx_init_excavator( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
4,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE,
|
// level-3
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3,
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -67,13 +83,16 @@ void bli_cntx_init_excavator( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
config/excavator/bli_kernel_defs_excavator.h
Normal file
52
config/excavator/bli_kernel_defs_excavator.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 16
|
||||||
|
#define BLIS_MR_d 8
|
||||||
|
#define BLIS_MR_c 4
|
||||||
|
#define BLIS_MR_z 2
|
||||||
|
|
||||||
|
#define BLIS_NR_s 3
|
||||||
|
#define BLIS_NR_d 3
|
||||||
|
#define BLIS_NR_c 2
|
||||||
|
#define BLIS_NR_z 2
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -37,32 +37,60 @@
|
|||||||
void bli_cntx_init_firestorm( cntx_t* cntx )
|
void bli_cntx_init_firestorm( cntx_t* cntx )
|
||||||
{
|
{
|
||||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
|
||||||
|
|
||||||
// Set default kernel blocksizes and functions.
|
// Set default kernel blocksizes and functions.
|
||||||
bli_cntx_init_firestorm_ref( cntx );
|
bli_cntx_init_firestorm_ref( cntx );
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
|
// level-3
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
|
||||||
|
|
||||||
|
// packm
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
|
||||||
|
|
||||||
|
// gemmsup
|
||||||
|
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Update the context with optimized packm kernels.
|
// Update the context with storage preferences.
|
||||||
bli_cntx_set_packm_kers
|
bli_cntx_set_ukr_prefs
|
||||||
(
|
(
|
||||||
4,
|
cntx,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk,
|
|
||||||
BLIS_PACKM_12XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk,
|
// level-3
|
||||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
cntx
|
|
||||||
|
// gemmsup
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -73,72 +101,47 @@ void bli_cntx_init_firestorm( cntx_t* cntx )
|
|||||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 );
|
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 );
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 );
|
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 );
|
||||||
|
|
||||||
|
// Initialize sup thresholds with architecture-appropriate values.
|
||||||
|
// s d c z
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MT ], -1, 99, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NT ], -1, 99, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KT ], -1, 99, -1, -1 );
|
||||||
|
|
||||||
|
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||||
|
// values.
|
||||||
|
// s d c z
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], -1, 240, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], -1, 1024, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], -1, 3072, -1, -1 );
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
// Update the context with the current architecture's register and cache
|
||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// sup thresholds
|
||||||
|
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
|
||||||
|
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
|
||||||
|
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
|
||||||
|
|
||||||
// Initialize sup thresholds with architecture-appropriate values.
|
// level-3 sup
|
||||||
// s d c z
|
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 99, -1, -1 );
|
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 99, -1, -1 );
|
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 99, -1, -1 );
|
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
|
||||||
|
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
|
||||||
|
|
||||||
// Initialize the context with the sup thresholds.
|
BLIS_VA_END
|
||||||
bli_cntx_set_l3_sup_thresh
|
|
||||||
(
|
|
||||||
3,
|
|
||||||
BLIS_MT, &thresh[ BLIS_MT ],
|
|
||||||
BLIS_NT, &thresh[ BLIS_NT ],
|
|
||||||
BLIS_KT, &thresh[ BLIS_KT ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
|
||||||
bli_cntx_set_l3_sup_kers
|
|
||||||
(
|
|
||||||
8,
|
|
||||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
|
||||||
// values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 6, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 240, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1024, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 3072, -1, -1 );
|
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
|
||||||
// blocksizes for small/unpacked level-3 problems.
|
|
||||||
bli_cntx_set_l3_sup_blkszs
|
|
||||||
(
|
|
||||||
5,
|
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
|
||||||
cntx
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/firestorm/bli_kernel_defs_firestorm.h
Normal file
48
config/firestorm/bli_kernel_defs_firestorm.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 8
|
||||||
|
#define BLIS_MR_d 6
|
||||||
|
|
||||||
|
#define BLIS_NR_s 12
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
42
config/generic/bli_kernel_defs_generic.h
Normal file
42
config/generic/bli_kernel_defs_generic.h
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -35,79 +35,58 @@
|
|||||||
|
|
||||||
#include "blis.h"
|
#include "blis.h"
|
||||||
|
|
||||||
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
|
|
||||||
|
|
||||||
void bli_cntx_init_haswell( cntx_t* cntx )
|
void bli_cntx_init_haswell( cntx_t* cntx )
|
||||||
{
|
{
|
||||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
|
||||||
|
|
||||||
// Set default kernel blocksizes and functions.
|
// Set default kernel blocksizes and functions.
|
||||||
bli_cntx_init_haswell_ref( cntx );
|
bli_cntx_init_haswell_ref( cntx );
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
8,
|
cntx,
|
||||||
|
|
||||||
// gemm
|
// gemm
|
||||||
#if 1
|
#if 1
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16,
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8,
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
|
||||||
#else
|
#else
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6,
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6,
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE,
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE,
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3,
|
||||||
#endif
|
#endif
|
||||||
// gemmtrsm_l
|
// gemmtrsm_l
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16,
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8,
|
||||||
|
|
||||||
// gemmtrsm_u
|
// gemmtrsm_u
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16,
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8,
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
// Update the context with optimized packm kernels.
|
// packm
|
||||||
bli_cntx_set_packm_kers
|
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
|
||||||
(
|
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
|
||||||
8,
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
|
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
|
||||||
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
|
BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
|
BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
|
BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
|
||||||
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
|
BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
|
|
||||||
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
|
|
||||||
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Update the context with optimized level-1f kernels.
|
|
||||||
bli_cntx_set_l1f_kers
|
|
||||||
(
|
|
||||||
4,
|
|
||||||
// axpyf
|
// axpyf
|
||||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||||
// dotxf
|
// dotxf
|
||||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1v kernels.
|
|
||||||
bli_cntx_set_l1v_kers
|
|
||||||
(
|
|
||||||
10,
|
|
||||||
|
|
||||||
// amaxv
|
// amaxv
|
||||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||||
@@ -137,7 +116,74 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
|||||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||||
#endif
|
#endif
|
||||||
cntx
|
|
||||||
|
// gemmsup
|
||||||
|
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
|
|
||||||
|
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// gemm
|
||||||
|
#if 1
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
#else
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
#endif
|
||||||
|
// gemmtrsm_l
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
// gemmtrsm_u
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
// gemmsup
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -161,97 +207,54 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
|||||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
|
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
|
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Initialize sup thresholds with architecture-appropriate values.
|
||||||
|
// s d c z
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 201, 201, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 201, 201, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 201, 201, -1, -1 );
|
||||||
|
|
||||||
|
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||||
|
// values.
|
||||||
|
// s d c z
|
||||||
|
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1,
|
||||||
|
9, 9, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 );
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
// Update the context with the current architecture's register and cache
|
||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 7,
|
cntx,
|
||||||
|
|
||||||
// level-3
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
|
|
||||||
// level-1f
|
// level-1f
|
||||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// gemmsup thresholds
|
||||||
|
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
|
||||||
|
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
|
||||||
|
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
|
||||||
|
|
||||||
// Initialize sup thresholds with architecture-appropriate values.
|
// level-3 sup
|
||||||
// s d c z
|
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 201, 201, -1, -1 );
|
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 201, 201, -1, -1 );
|
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 201, 201, -1, -1 );
|
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
|
||||||
|
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
|
||||||
|
|
||||||
// Initialize the context with the sup thresholds.
|
BLIS_VA_END
|
||||||
bli_cntx_set_l3_sup_thresh
|
|
||||||
(
|
|
||||||
3,
|
|
||||||
BLIS_MT, &thresh[ BLIS_MT ],
|
|
||||||
BLIS_NT, &thresh[ BLIS_NT ],
|
|
||||||
BLIS_KT, &thresh[ BLIS_KT ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
// Initialize the context with the sup handlers.
|
|
||||||
bli_cntx_set_l3_sup_handlers
|
|
||||||
(
|
|
||||||
1,
|
|
||||||
BLIS_GEMM, bli_gemmsup_ref,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
|
||||||
bli_cntx_set_l3_sup_kers
|
|
||||||
(
|
|
||||||
16,
|
|
||||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
|
||||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
|
|
||||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
|
||||||
// values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
|
|
||||||
9, 9, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 );
|
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
|
||||||
// blocksizes for small/unpacked level-3 problems.
|
|
||||||
bli_cntx_set_l3_sup_blkszs
|
|
||||||
(
|
|
||||||
5,
|
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
|
||||||
cntx
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
config/haswell/bli_kernel_defs_haswell.h
Normal file
52
config/haswell/bli_kernel_defs_haswell.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 6
|
||||||
|
#define BLIS_MR_d 6
|
||||||
|
#define BLIS_MR_c 3
|
||||||
|
#define BLIS_MR_z 3
|
||||||
|
|
||||||
|
#define BLIS_NR_s 16
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
#define BLIS_NR_c 8
|
||||||
|
#define BLIS_NR_z 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,13 +43,26 @@ void bli_cntx_init_knc( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
1,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, TRUE,
|
|
||||||
cntx
|
// level-3
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -66,13 +79,16 @@ void bli_cntx_init_knc( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/knc/bli_kernel_defs_knc.h
Normal file
48
config/knc/bli_kernel_defs_knc.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_d 30
|
||||||
|
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
|
||||||
|
#define BLIS_PACKMR_d 32
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,47 +43,33 @@ void bli_cntx_init_knl( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native kernels (level-3 gemm, packm, level-1f, and level-1v).
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized packm kernels.
|
// level-3
|
||||||
bli_cntx_set_packm_kers
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16,
|
||||||
(
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8,
|
||||||
2,
|
|
||||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk,
|
// packm
|
||||||
BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk,
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk,
|
||||||
cntx
|
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk,
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1f kernels.
|
|
||||||
bli_cntx_set_l1f_kers
|
|
||||||
(
|
|
||||||
4,
|
|
||||||
// axpyf
|
// axpyf
|
||||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||||
// dotxf
|
|
||||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
// dotxf
|
||||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||||
cntx
|
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1v kernels.
|
|
||||||
bli_cntx_set_l1v_kers
|
|
||||||
(
|
|
||||||
10,
|
|
||||||
#if 1
|
#if 1
|
||||||
// amaxv
|
// amaxv
|
||||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// axpyv
|
// axpyv
|
||||||
#if 0
|
#if 0
|
||||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||||
@@ -92,12 +78,15 @@ void bli_cntx_init_knl( cntx_t* cntx )
|
|||||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// dotv
|
// dotv
|
||||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||||
|
|
||||||
// dotxv
|
// dotxv
|
||||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||||
|
|
||||||
// scalv
|
// scalv
|
||||||
#if 0
|
#if 0
|
||||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||||
@@ -106,7 +95,20 @@ void bli_cntx_init_knl( cntx_t* cntx )
|
|||||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||||
#endif
|
#endif
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -125,17 +127,20 @@ void bli_cntx_init_knl( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 7,
|
cntx,
|
||||||
|
|
||||||
// level-3
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
|
|
||||||
// level-1f
|
// level-1f
|
||||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/knl/bli_kernel_defs_knl.h
Normal file
48
config/knl/bli_kernel_defs_knl.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 24
|
||||||
|
#define BLIS_MR_d 24
|
||||||
|
|
||||||
|
#define BLIS_NR_s 16
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -66,7 +66,7 @@ void bli_cntx_init_armv7a( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
5,
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
|
|||||||
@@ -203,7 +203,7 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 7,
|
7,
|
||||||
// level-3
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
|
|||||||
@@ -43,18 +43,36 @@ void bli_cntx_init_penryn( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
4,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, FALSE,
|
//level-3
|
||||||
//BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4,
|
||||||
//BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, FALSE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4,
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, FALSE,
|
//BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4,
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, FALSE,
|
//BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4,
|
||||||
cntx
|
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4,
|
||||||
|
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
//level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
//BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
//BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -69,13 +87,16 @@ void bli_cntx_init_penryn( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/penryn/bli_kernel_defs_penryn.h
Normal file
48
config/penryn/bli_kernel_defs_penryn.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 8
|
||||||
|
#define BLIS_MR_d 4
|
||||||
|
|
||||||
|
#define BLIS_NR_s 4
|
||||||
|
#define BLIS_NR_d 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,16 +43,32 @@ void bli_cntx_init_piledriver( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
4,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE,
|
// level-3
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3,
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -67,13 +83,16 @@ void bli_cntx_init_piledriver( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
config/piledriver/bli_kernel_defs_piledriver.h
Normal file
52
config/piledriver/bli_kernel_defs_piledriver.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 16
|
||||||
|
#define BLIS_MR_d 8
|
||||||
|
#define BLIS_MR_c 4
|
||||||
|
#define BLIS_MR_z 2
|
||||||
|
|
||||||
|
#define BLIS_NR_s 3
|
||||||
|
#define BLIS_NR_d 3
|
||||||
|
#define BLIS_NR_c 2
|
||||||
|
#define BLIS_NR_z 2
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -34,35 +34,6 @@
|
|||||||
|
|
||||||
#include "blis.h"
|
#include "blis.h"
|
||||||
|
|
||||||
// Instantiate prototypes for packm kernels.
|
|
||||||
PACKM_KER_PROT( float, s, packm_6xk_bb4_power10_ref )
|
|
||||||
PACKM_KER_PROT( double, d, packm_6xk_bb2_power10_ref )
|
|
||||||
|
|
||||||
// Instantiate prototypes for level-3 kernels.
|
|
||||||
GEMM_UKR_PROT( float, s, gemmbb_power10_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power10_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power10_ref )
|
|
||||||
TRSM_UKR_PROT( float, s, trsmbb_l_power10_ref )
|
|
||||||
TRSM_UKR_PROT( float, s, trsmbb_u_power10_ref )
|
|
||||||
|
|
||||||
GEMM_UKR_PROT( double, d, gemmbb_power10_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power10_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power10_ref )
|
|
||||||
TRSM_UKR_PROT( double, d, trsmbb_l_power10_ref )
|
|
||||||
TRSM_UKR_PROT( double, d, trsmbb_u_power10_ref )
|
|
||||||
|
|
||||||
GEMM_UKR_PROT( scomplex, c, gemmbb_power10_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power10_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power10_ref )
|
|
||||||
TRSM_UKR_PROT( scomplex, c, trsmbb_l_power10_ref )
|
|
||||||
TRSM_UKR_PROT( scomplex, c, trsmbb_u_power10_ref )
|
|
||||||
|
|
||||||
GEMM_UKR_PROT( dcomplex, z, gemmbb_power10_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power10_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power10_ref )
|
|
||||||
TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power10_ref )
|
|
||||||
TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power10_ref )
|
|
||||||
|
|
||||||
void bli_cntx_init_power10( cntx_t* cntx )
|
void bli_cntx_init_power10( cntx_t* cntx )
|
||||||
{
|
{
|
||||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||||
@@ -72,51 +43,38 @@ void bli_cntx_init_power10( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
12,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, TRUE,
|
|
||||||
|
|
||||||
BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power10_ref, FALSE,
|
// level-3
|
||||||
BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power10_ref, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8,
|
||||||
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, TRUE,
|
BLIS_VA_END
|
||||||
|
|
||||||
BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power10_ref, FALSE,
|
|
||||||
BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power10_ref, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power10_ref, FALSE,
|
|
||||||
BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref, FALSE,
|
|
||||||
BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power10_ref, FALSE,
|
|
||||||
BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref, FALSE,
|
|
||||||
BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref, FALSE,
|
|
||||||
cntx
|
|
||||||
);
|
);
|
||||||
|
|
||||||
// Update the context with customized virtual [gemm]trsm micro-kernels.
|
// Update the context with storage preferences.
|
||||||
bli_cntx_set_l3_vir_ukrs
|
bli_cntx_set_ukr_prefs
|
||||||
(
|
(
|
||||||
8,
|
cntx,
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power10_ref,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power10_ref,
|
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power10_ref,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power10_ref,
|
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power10_ref,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref,
|
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized packm kernels.
|
// level-3
|
||||||
bli_cntx_set_packm_kers
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
(
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
2,
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power10_ref,
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power10_ref,
|
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
cntx
|
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// s d c z
|
// s d c z
|
||||||
@@ -131,14 +89,16 @@ void bli_cntx_init_power10( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
// level-3
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
51
config/power10/bli_kernel_defs_power10.h
Normal file
51
config/power10/bli_kernel_defs_power10.h
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 8
|
||||||
|
#define BLIS_MR_d 8
|
||||||
|
|
||||||
|
#define BLIS_NR_s 16
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
|
||||||
|
#define BLIS_BBN_s 4
|
||||||
|
#define BLIS_BBN_d 2
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,13 +43,26 @@ void bli_cntx_init_power7( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
1,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE,
|
|
||||||
cntx
|
// level-3
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -64,13 +77,16 @@ void bli_cntx_init_power7( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
46
config/power7/bli_kernel_defs_power7.h
Normal file
46
config/power7/bli_kernel_defs_power7.h
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_d 8
|
||||||
|
|
||||||
|
#define BLIS_NR_d 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -34,35 +34,6 @@
|
|||||||
|
|
||||||
#include "blis.h"
|
#include "blis.h"
|
||||||
|
|
||||||
// Instantiate prototypes for packm kernels.
|
|
||||||
PACKM_KER_PROT( float, s, packm_6xk_bb4_power9_ref )
|
|
||||||
PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref )
|
|
||||||
|
|
||||||
// Instantiate prototypes for level-3 kernels.
|
|
||||||
GEMM_UKR_PROT( float, s, gemmbb_power9_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power9_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power9_ref )
|
|
||||||
TRSM_UKR_PROT( float, s, trsmbb_l_power9_ref )
|
|
||||||
TRSM_UKR_PROT( float, s, trsmbb_u_power9_ref )
|
|
||||||
|
|
||||||
GEMM_UKR_PROT( double, d, gemmbb_power9_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref )
|
|
||||||
TRSM_UKR_PROT( double, d, trsmbb_l_power9_ref )
|
|
||||||
TRSM_UKR_PROT( double, d, trsmbb_u_power9_ref )
|
|
||||||
|
|
||||||
GEMM_UKR_PROT( scomplex, c, gemmbb_power9_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref )
|
|
||||||
TRSM_UKR_PROT( scomplex, c, trsmbb_l_power9_ref )
|
|
||||||
TRSM_UKR_PROT( scomplex, c, trsmbb_u_power9_ref )
|
|
||||||
|
|
||||||
GEMM_UKR_PROT( dcomplex, z, gemmbb_power9_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref )
|
|
||||||
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref )
|
|
||||||
TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power9_ref )
|
|
||||||
TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power9_ref )
|
|
||||||
|
|
||||||
void bli_cntx_init_power9( cntx_t* cntx )
|
void bli_cntx_init_power9( cntx_t* cntx )
|
||||||
{
|
{
|
||||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||||
@@ -72,50 +43,37 @@ void bli_cntx_init_power9( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
12,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, FALSE,
|
|
||||||
BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, FALSE,
|
|
||||||
BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, FALSE,
|
|
||||||
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE,
|
// level-3
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6,
|
||||||
|
|
||||||
BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, FALSE,
|
BLIS_VA_END
|
||||||
BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, FALSE,
|
|
||||||
BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, FALSE,
|
|
||||||
BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, FALSE,
|
|
||||||
BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, FALSE,
|
|
||||||
BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, FALSE,
|
|
||||||
cntx
|
|
||||||
);
|
);
|
||||||
|
|
||||||
// Update the context with customized virtual [gemm]trsm micro-kernels.
|
// Update the context with storage preferences.
|
||||||
bli_cntx_set_l3_vir_ukrs
|
bli_cntx_set_ukr_prefs
|
||||||
(
|
(
|
||||||
8,
|
cntx,
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power9_ref,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power9_ref,
|
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power9_ref,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power9_ref,
|
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref,
|
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized packm kernels.
|
// level-3
|
||||||
bli_cntx_set_packm_kers
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
(
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
2,
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power9_ref,
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref,
|
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
cntx
|
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
@@ -131,14 +89,15 @@ void bli_cntx_init_power9( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
// level-3
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
49
config/power9/bli_kernel_defs_power9.h
Normal file
49
config/power9/bli_kernel_defs_power9.h
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_d 12
|
||||||
|
|
||||||
|
#define BLIS_NR_d 6
|
||||||
|
|
||||||
|
#define BLIS_BBN_s 4
|
||||||
|
#define BLIS_BBN_d 2
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,16 +43,32 @@ void bli_cntx_init_sandybridge( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
4,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, FALSE,
|
// level-3
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4,
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -67,13 +83,16 @@ void bli_cntx_init_sandybridge( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
config/sandybridge/bli_kernel_defs_sandybridge.h
Normal file
52
config/sandybridge/bli_kernel_defs_sandybridge.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 8
|
||||||
|
#define BLIS_MR_d 8
|
||||||
|
#define BLIS_MR_c 8
|
||||||
|
#define BLIS_MR_z 4
|
||||||
|
|
||||||
|
#define BLIS_NR_s 8
|
||||||
|
#define BLIS_NR_d 4
|
||||||
|
#define BLIS_NR_c 4
|
||||||
|
#define BLIS_NR_z 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,39 +43,29 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
// gemm
|
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
|
// level-3
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2,
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14,
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1f kernels.
|
|
||||||
bli_cntx_set_l1f_kers
|
|
||||||
(
|
|
||||||
4,
|
|
||||||
// axpyf
|
// axpyf
|
||||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||||
|
|
||||||
// dotxf
|
// dotxf
|
||||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1v kernels.
|
|
||||||
bli_cntx_set_l1v_kers
|
|
||||||
(
|
|
||||||
10,
|
|
||||||
#if 1
|
#if 1
|
||||||
// amaxv
|
// amaxv
|
||||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// axpyv
|
// axpyv
|
||||||
#if 0
|
#if 0
|
||||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||||
@@ -84,12 +74,15 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
|||||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// dotv
|
// dotv
|
||||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||||
|
|
||||||
// dotxv
|
// dotxv
|
||||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||||
|
|
||||||
// scalv
|
// scalv
|
||||||
#if 0
|
#if 0
|
||||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||||
@@ -98,7 +91,20 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
|||||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||||
#endif
|
#endif
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT , FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -116,17 +122,20 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 7,
|
cntx,
|
||||||
|
|
||||||
// level-3
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
|
|
||||||
// level-1f
|
// level-1f
|
||||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/skx/bli_kernel_defs_skx.h
Normal file
48
config/skx/bli_kernel_defs_skx.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 32
|
||||||
|
#define BLIS_MR_d 16
|
||||||
|
|
||||||
|
#define BLIS_NR_s 12
|
||||||
|
#define BLIS_NR_d 14
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,16 +43,32 @@ void bli_cntx_init_steamroller( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
4,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE,
|
// level-3
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3,
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -67,13 +83,16 @@ void bli_cntx_init_steamroller( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
config/steamroller/bli_kernel_defs_steamroller.h
Normal file
52
config/steamroller/bli_kernel_defs_steamroller.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 16
|
||||||
|
#define BLIS_MR_d 8
|
||||||
|
#define BLIS_MR_c 4
|
||||||
|
#define BLIS_MR_z 2
|
||||||
|
|
||||||
|
#define BLIS_NR_s 3
|
||||||
|
#define BLIS_NR_d 3
|
||||||
|
#define BLIS_NR_c 2
|
||||||
|
#define BLIS_NR_z 2
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -45,34 +45,44 @@ void bli_cntx_init_template( cntx_t* cntx )
|
|||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels and
|
||||||
// their storage preferences.
|
// their storage preferences.
|
||||||
bli_cntx_set_l3_nat_ukrs
|
bli_cntx_set_ukrs
|
||||||
(
|
(
|
||||||
5,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, FALSE,
|
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, FALSE,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, FALSE,
|
|
||||||
BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, FALSE,
|
|
||||||
BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, FALSE,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1f kernels.
|
// level-3
|
||||||
bli_cntx_set_l1f_kers
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt,
|
||||||
(
|
BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt,
|
||||||
|
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt,
|
||||||
|
BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt,
|
||||||
|
BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt,
|
||||||
|
|
||||||
|
// level-1f
|
||||||
BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt,
|
BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt,
|
||||||
BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt,
|
BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt,
|
||||||
BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_template_noopt,
|
BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_template_noopt,
|
||||||
BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_template_noopt,
|
BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_template_noopt,
|
||||||
BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt,
|
BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt,
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1v kernels.
|
// level-1v
|
||||||
bli_cntx_set_l1v_kers
|
|
||||||
(
|
|
||||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt,
|
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt,
|
||||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_template_noopt,
|
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_template_noopt,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -87,13 +97,16 @@ void bli_cntx_init_template( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
60
config/template/bli_kernel_defs_template.h
Normal file
60
config/template/bli_kernel_defs_template.h
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
//
|
||||||
|
// Only defined for block sizes which are not taken as the default (i.e. when
|
||||||
|
// an optimized kernel is provided).
|
||||||
|
//
|
||||||
|
|
||||||
|
#define BLIS_MR_z 4
|
||||||
|
|
||||||
|
#define BLIS_NR_z 4
|
||||||
|
|
||||||
|
//
|
||||||
|
// PACKMR/PACKNR do not need to be defined unless they are different from the
|
||||||
|
// "normal" MR/NR.
|
||||||
|
//
|
||||||
|
|
||||||
|
//#define BLIS_PACKMR_z 4
|
||||||
|
|
||||||
|
//#define BLIS_PACKNR_z 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -43,14 +43,28 @@ void bli_cntx_init_thunderx2( cntx_t* cntx )
|
|||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
|
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
|
// level-3
|
||||||
cntx
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12,
|
||||||
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -65,13 +79,16 @@ void bli_cntx_init_thunderx2( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
cntx,
|
||||||
|
|
||||||
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48
config/thunderx2/bli_kernel_defs_thunderx2.h
Normal file
48
config/thunderx2/bli_kernel_defs_thunderx2.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 8
|
||||||
|
#define BLIS_MR_d 6
|
||||||
|
|
||||||
|
#define BLIS_NR_s 12
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -40,92 +40,107 @@
|
|||||||
void bli_cntx_init_zen( cntx_t* cntx )
|
void bli_cntx_init_zen( cntx_t* cntx )
|
||||||
{
|
{
|
||||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
|
||||||
|
|
||||||
// Set default kernel blocksizes and functions.
|
// Set default kernel blocksizes and functions.
|
||||||
bli_cntx_init_zen_ref( cntx );
|
bli_cntx_init_zen_ref( cntx );
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
8,
|
cntx,
|
||||||
|
|
||||||
// gemm
|
// gemm
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16,
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8,
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
|
||||||
|
|
||||||
// gemmtrsm_l
|
// gemmtrsm_l
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16,
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8,
|
||||||
|
|
||||||
// gemmtrsm_u
|
// gemmtrsm_u
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
|
||||||
|
|
||||||
cntx
|
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16,
|
||||||
);
|
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8,
|
||||||
|
|
||||||
#if 1
|
// gemmsup
|
||||||
// Update the context with optimized packm kernels.
|
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
bli_cntx_set_packm_kers
|
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
|
||||||
(
|
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
8,
|
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
|
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
|
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
|
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
|
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
|
|
||||||
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
|
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
|
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
|
||||||
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
|
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
cntx
|
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
);
|
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
|
#if 0
|
||||||
|
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Update the context with optimized level-1f kernels.
|
#if 0
|
||||||
bli_cntx_set_l1f_kers
|
// NOTE: This set of kernels is likely broken and therefore disabled.
|
||||||
(
|
BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
|
||||||
4,
|
BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
|
||||||
|
|
||||||
|
BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// packm
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
|
||||||
|
|
||||||
// axpyf
|
// axpyf
|
||||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||||
|
|
||||||
// dotxf
|
// dotxf
|
||||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||||
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1v kernels.
|
|
||||||
bli_cntx_set_l1v_kers
|
|
||||||
(
|
|
||||||
16,
|
|
||||||
|
|
||||||
// amaxv
|
// amaxv
|
||||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||||
|
|
||||||
// axpyv
|
// axpyv
|
||||||
#if 0
|
|
||||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
|
||||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
|
||||||
#else
|
|
||||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||||
#endif
|
|
||||||
|
|
||||||
#if 1
|
|
||||||
// copyv
|
// copyv
|
||||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
||||||
#endif
|
|
||||||
|
|
||||||
// dotv
|
// dotv
|
||||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||||
@@ -136,25 +151,76 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
|||||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||||
|
|
||||||
// scalv
|
// scalv
|
||||||
#if 0
|
|
||||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
|
||||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
|
||||||
#else
|
|
||||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||||
#endif
|
|
||||||
|
|
||||||
#if 1
|
|
||||||
// setv
|
// setv
|
||||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
||||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
||||||
|
|
||||||
// swapv
|
// swapv
|
||||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// gemm
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
|
||||||
|
// gemmtrsm_l
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
// gemmtrsm_u
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
// gemmsup
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// NOTE: This set of kernels is likely broken and therefore disabled.
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
cntx
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -195,131 +261,74 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
|||||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||||
|
|
||||||
|
// Initialize sup thresholds with architecture-appropriate values.
|
||||||
|
// s d c z
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 512, 256, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 440, 220, -1, -1 );
|
||||||
|
|
||||||
|
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||||
|
// values.
|
||||||
|
// s d c z
|
||||||
|
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1,
|
||||||
|
9, 9, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, -1, -1 );
|
||||||
|
#if 0
|
||||||
|
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3,
|
||||||
|
9, 9, 3, 3 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 );
|
||||||
|
#endif
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
// Update the context with the current architecture's register and cache
|
||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 7,
|
cntx,
|
||||||
|
|
||||||
// level-3
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
|
|
||||||
// level-1f
|
// level-1f
|
||||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||||
cntx
|
|
||||||
|
// sup thresholds
|
||||||
|
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
|
||||||
|
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
|
||||||
|
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
|
||||||
|
|
||||||
|
// gemmsup
|
||||||
|
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
|
||||||
|
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
|
||||||
|
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
|
||||||
|
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
|
||||||
|
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Initialize sup thresholds with architecture-appropriate values.
|
#if 0
|
||||||
// s d c z
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 );
|
|
||||||
|
|
||||||
// Initialize the context with the sup thresholds.
|
|
||||||
bli_cntx_set_l3_sup_thresh
|
|
||||||
(
|
|
||||||
3,
|
|
||||||
BLIS_MT, &thresh[ BLIS_MT ],
|
|
||||||
BLIS_NT, &thresh[ BLIS_NT ],
|
|
||||||
BLIS_KT, &thresh[ BLIS_KT ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Initialize the context with the sup handlers.
|
// Initialize the context with the sup handlers.
|
||||||
bli_cntx_set_l3_sup_handlers
|
bli_cntx_set_l3_sup_handlers
|
||||||
(
|
(
|
||||||
1,
|
cntx,
|
||||||
|
|
||||||
BLIS_GEMM, bli_gemmsup_ref,
|
BLIS_GEMM, bli_gemmsup_ref,
|
||||||
//BLIS_GEMMT, bli_gemmtsup_ref,
|
//BLIS_GEMMT, bli_gemmtsup_ref,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
|
||||||
bli_cntx_set_l3_sup_kers
|
|
||||||
(
|
|
||||||
16,
|
|
||||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
|
||||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
|
|
||||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
#if 0
|
|
||||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if 0
|
|
||||||
// NOTE: This set of kernels is likely broken and therefore disabled.
|
|
||||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
|
||||||
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
|
||||||
|
|
||||||
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
|
||||||
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
|
||||||
#endif
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
|
||||||
// values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
|
|
||||||
9, 9, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
|
|
||||||
#if 0
|
|
||||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
|
|
||||||
9, 9, 3, 3 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
|
||||||
// blocksizes for small/unpacked level-3 problems.
|
|
||||||
bli_cntx_set_l3_sup_blkszs
|
|
||||||
(
|
|
||||||
5,
|
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
config/zen/bli_kernel_defs_zen.h
Normal file
52
config/zen/bli_kernel_defs_zen.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 6
|
||||||
|
#define BLIS_MR_d 6
|
||||||
|
#define BLIS_MR_c 3
|
||||||
|
#define BLIS_MR_z 3
|
||||||
|
|
||||||
|
#define BLIS_NR_s 16
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
#define BLIS_NR_c 8
|
||||||
|
#define BLIS_NR_z 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -38,73 +38,94 @@
|
|||||||
void bli_cntx_init_zen2( cntx_t* cntx )
|
void bli_cntx_init_zen2( cntx_t* cntx )
|
||||||
{
|
{
|
||||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
|
||||||
|
|
||||||
// Set default kernel blocksizes and functions.
|
// Set default kernel blocksizes and functions.
|
||||||
bli_cntx_init_zen2_ref( cntx );
|
bli_cntx_init_zen2_ref( cntx );
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
8,
|
cntx,
|
||||||
|
|
||||||
// gemm
|
// gemm
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16,
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8,
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
|
||||||
|
|
||||||
// gemmtrsm_l
|
// gemmtrsm_l
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16,
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8,
|
||||||
|
|
||||||
// gemmtrsm_u
|
// gemmtrsm_u
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16,
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8,
|
||||||
|
|
||||||
cntx
|
// level-3 sup
|
||||||
);
|
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
|
|
||||||
#if 1
|
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
// Update the context with optimized packm kernels.
|
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
|
||||||
bli_cntx_set_packm_kers
|
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
(
|
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
8,
|
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
|
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
|
||||||
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
|
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
|
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
|
#if 0
|
||||||
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
|
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
|
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m,
|
||||||
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
|
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
|
||||||
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
|
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
|
||||||
cntx
|
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
|
||||||
);
|
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Update the context with optimized level-1f kernels.
|
#if 0
|
||||||
bli_cntx_set_l1f_kers
|
// NOTE: This set of kernels is likely broken and therefore disabled.
|
||||||
(
|
BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
|
||||||
4,
|
BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
|
||||||
|
|
||||||
|
BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// packm
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
|
||||||
|
|
||||||
// axpyf
|
// axpyf
|
||||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
|
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
|
||||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
|
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
|
||||||
|
|
||||||
// dotxf
|
// dotxf
|
||||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||||
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1v kernels.
|
|
||||||
bli_cntx_set_l1v_kers
|
|
||||||
(
|
|
||||||
16,
|
|
||||||
|
|
||||||
// amaxv
|
// amaxv
|
||||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||||
@@ -127,18 +148,59 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
|||||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||||
|
|
||||||
//swap
|
//swap
|
||||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||||
|
|
||||||
//copy
|
//copy
|
||||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
||||||
|
|
||||||
//set
|
//set
|
||||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
||||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
||||||
|
|
||||||
cntx
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// gemm
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
|
||||||
|
// gemmtrsm_l
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
// gemmtrsm_u
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
// level-3 sup
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -158,130 +220,73 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
|||||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
|
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||||
|
|
||||||
|
// Initialize sup thresholds with architecture-appropriate values.
|
||||||
|
// s d c z
|
||||||
|
#if 1
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 500, 249, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 500, 249, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 500, 249, -1, -1 );
|
||||||
|
#else
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 100000, 100000, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 100000, 100000, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 100000, 100000, -1, -1 );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||||
|
// values.
|
||||||
|
// s d c z
|
||||||
|
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1,
|
||||||
|
9, 9, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 );
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
// Update the context with the current architecture's register and cache
|
||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 7,
|
cntx,
|
||||||
|
|
||||||
// level-3
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
|
|
||||||
// level-1f
|
// level-1f
|
||||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||||
cntx
|
|
||||||
|
// sup thresholds
|
||||||
|
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
|
||||||
|
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
|
||||||
|
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
|
||||||
|
|
||||||
|
// level-3 sup
|
||||||
|
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NC_SUP,
|
||||||
|
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KC_SUP,
|
||||||
|
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MC_SUP,
|
||||||
|
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
|
||||||
|
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Initialize sup thresholds with architecture-appropriate values.
|
|
||||||
// s d c z
|
|
||||||
#if 1
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 500, 249, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 500, 249, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 500, 249, -1, -1 );
|
|
||||||
#else
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000, -1, -1 );
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Initialize the context with the sup thresholds.
|
|
||||||
bli_cntx_set_l3_sup_thresh
|
|
||||||
(
|
|
||||||
3,
|
|
||||||
BLIS_MT, &thresh[ BLIS_MT ],
|
|
||||||
BLIS_NT, &thresh[ BLIS_NT ],
|
|
||||||
BLIS_KT, &thresh[ BLIS_KT ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
// Initialize the context with the sup handlers.
|
// Initialize the context with the sup handlers.
|
||||||
bli_cntx_set_l3_sup_handlers
|
bli_cntx_set_l3_sup_handlers
|
||||||
(
|
(
|
||||||
1,
|
cntx,
|
||||||
|
|
||||||
BLIS_GEMM, bli_gemmsup_ref,
|
BLIS_GEMM, bli_gemmsup_ref,
|
||||||
cntx
|
//BLIS_GEMMT, bli_gemmtsup_ref,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
|
||||||
bli_cntx_set_l3_sup_kers
|
|
||||||
(
|
|
||||||
16,
|
|
||||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
|
||||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
|
|
||||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
#if 0
|
|
||||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
// NOTE: This set of kernels is likely broken and therefore disabled.
|
|
||||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
|
||||||
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
|
||||||
|
|
||||||
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
|
||||||
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
|
||||||
#endif
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
|
||||||
// values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
|
|
||||||
9, 9, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 );
|
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
|
||||||
// blocksizes for small/unpacked level-3 problems.
|
|
||||||
bli_cntx_set_l3_sup_blkszs
|
|
||||||
(
|
|
||||||
5,
|
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
config/zen2/bli_kernel_defs_zen2.h
Normal file
52
config/zen2/bli_kernel_defs_zen2.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 6
|
||||||
|
#define BLIS_MR_d 6
|
||||||
|
#define BLIS_MR_c 3
|
||||||
|
#define BLIS_MR_z 3
|
||||||
|
|
||||||
|
#define BLIS_NR_s 16
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
#define BLIS_NR_c 8
|
||||||
|
#define BLIS_NR_z 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -37,83 +37,106 @@
|
|||||||
void bli_cntx_init_zen3( cntx_t* cntx )
|
void bli_cntx_init_zen3( cntx_t* cntx )
|
||||||
{
|
{
|
||||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
|
||||||
|
|
||||||
// Set default kernel blocksizes and functions.
|
// Set default kernel blocksizes and functions.
|
||||||
bli_cntx_init_zen3_ref( cntx );
|
bli_cntx_init_zen3_ref( cntx );
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Update the context with optimized native gemm micro-kernels and
|
// Update the context with optimized native gemm micro-kernels.
|
||||||
// their storage preferences.
|
bli_cntx_set_ukrs
|
||||||
bli_cntx_set_l3_nat_ukrs
|
|
||||||
(
|
(
|
||||||
8,
|
cntx,
|
||||||
|
|
||||||
// gemm
|
// gemm
|
||||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16,
|
||||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8,
|
||||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,
|
||||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,
|
||||||
|
|
||||||
// gemmtrsm_l
|
// gemmtrsm_l
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16,
|
||||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8,
|
||||||
|
|
||||||
// gemmtrsm_u
|
// gemmtrsm_u
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16,
|
||||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8,
|
||||||
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
|
// gemmsup
|
||||||
#if 0
|
#if 0
|
||||||
// AMD: This will be enabled in other PRs.
|
// AMD: This should be enabled in the PR which has added these kernels
|
||||||
// packm kernels
|
// Update the context with optimized small/unpacked gemm kernels.
|
||||||
bli_cntx_set_packm_kers
|
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
(
|
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
|
||||||
2,
|
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
|
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
|
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
cntx
|
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
|
||||||
);
|
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
|
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n,
|
||||||
|
BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n,
|
||||||
#else
|
#else
|
||||||
// Update the context with optimized packm kernels.
|
BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
bli_cntx_set_packm_kers
|
BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m,
|
||||||
(
|
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
8,
|
BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
|
BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||||
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
|
BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n,
|
||||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
|
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
|
BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n,
|
||||||
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
|
|
||||||
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
|
BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
|
BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m,
|
||||||
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
|
BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
cntx
|
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
);
|
BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Update the context with optimized level-1f kernels.
|
// packm
|
||||||
bli_cntx_set_l1f_kers
|
#if 0
|
||||||
(
|
// AMD: This will be enabled in other PRs.
|
||||||
4,
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
|
||||||
|
#else
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
|
||||||
|
BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
|
||||||
|
BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
|
||||||
|
#endif
|
||||||
|
|
||||||
// axpyf
|
// axpyf
|
||||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
|
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
|
||||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
|
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
|
||||||
|
|
||||||
// dotxf
|
// dotxf
|
||||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||||
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
// Update the context with optimized level-1v kernels.
|
|
||||||
bli_cntx_set_l1v_kers
|
|
||||||
(
|
|
||||||
16,
|
|
||||||
|
|
||||||
// amaxv
|
// amaxv
|
||||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||||
@@ -135,19 +158,75 @@ void bli_cntx_init_zen3( cntx_t* cntx )
|
|||||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||||
|
|
||||||
//swap
|
// swapv
|
||||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||||
|
|
||||||
//copy
|
// copyv
|
||||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
||||||
|
|
||||||
//set
|
// setv
|
||||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
||||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
||||||
|
|
||||||
cntx
|
BLIS_VA_END
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update the context with storage preferences.
|
||||||
|
bli_cntx_set_ukr_prefs
|
||||||
|
(
|
||||||
|
cntx,
|
||||||
|
|
||||||
|
// gemm
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
|
||||||
|
// gemmtrsm_l
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
// gemmtrsm_u
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
|
||||||
|
// gemmsup
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
|
||||||
|
#if 0
|
||||||
|
// AMD: This should be enabled in the PR which has added these kernels
|
||||||
|
// Update the context with optimized small/unpacked gemm kernels.
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
|
||||||
|
#endif
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||||
@@ -164,138 +243,67 @@ void bli_cntx_init_zen3( cntx_t* cntx )
|
|||||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
|
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||||
|
|
||||||
|
// Initialize sup thresholds with architecture-appropriate values.
|
||||||
|
// s d c z
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NT ], 200, 256, -1, -1 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KT ], 240, 220, -1, -1 );
|
||||||
|
|
||||||
|
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||||
|
// values.
|
||||||
|
// s d c z
|
||||||
|
bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3,
|
||||||
|
9, 9, 3, 3 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 );
|
||||||
|
bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 );
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
// Update the context with the current architecture's register and cache
|
||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 7,
|
cntx,
|
||||||
|
|
||||||
// level-3
|
// level-3
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||||
|
|
||||||
// level-1f
|
// level-1f
|
||||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||||
cntx
|
|
||||||
|
// sup thresholds
|
||||||
|
BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT,
|
||||||
|
BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT,
|
||||||
|
BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT,
|
||||||
|
|
||||||
|
// gemmsup
|
||||||
|
BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP,
|
||||||
|
BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP,
|
||||||
|
BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP,
|
||||||
|
BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP,
|
||||||
|
BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP,
|
||||||
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
// Initialize sup thresholds with architecture-appropriate values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, -1, -1 );
|
|
||||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, -1, -1 );
|
|
||||||
|
|
||||||
// Initialize the context with the sup thresholds.
|
|
||||||
bli_cntx_set_l3_sup_thresh
|
|
||||||
(
|
|
||||||
3,
|
|
||||||
BLIS_MT, &thresh[ BLIS_MT ],
|
|
||||||
BLIS_NT, &thresh[ BLIS_NT ],
|
|
||||||
BLIS_KT, &thresh[ BLIS_KT ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
// Initialize the context with the sup handlers.
|
// Initialize the context with the sup handlers.
|
||||||
bli_cntx_set_l3_sup_handlers
|
bli_cntx_set_l3_sup_handlers
|
||||||
(
|
(
|
||||||
2,
|
cntx,
|
||||||
|
|
||||||
BLIS_GEMM, bli_gemmsup_ref,
|
BLIS_GEMM, bli_gemmsup_ref,
|
||||||
BLIS_GEMMT, bli_gemmtsup_ref,
|
//BLIS_GEMMT, bli_gemmtsup_ref,
|
||||||
cntx
|
|
||||||
|
BLIS_VA_END
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if 0
|
|
||||||
// AMD: This should be enabled in the PR which has added these kernels
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
|
||||||
bli_cntx_set_l3_sup_kers
|
|
||||||
(
|
|
||||||
28,
|
|
||||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
|
||||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
|
||||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
|
||||||
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
|
||||||
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
|
||||||
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
#else
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
|
||||||
bli_cntx_set_l3_sup_kers
|
|
||||||
(
|
|
||||||
16,
|
|
||||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
|
||||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
|
||||||
|
|
||||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
|
|
||||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
|
||||||
// values.
|
|
||||||
// s d c z
|
|
||||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
|
|
||||||
9, 9, 3, 3 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
|
|
||||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
|
|
||||||
|
|
||||||
// Update the context with the current architecture's register and cache
|
|
||||||
// blocksizes for small/unpacked level-3 problems.
|
|
||||||
bli_cntx_set_l3_sup_blkszs
|
|
||||||
(
|
|
||||||
5,
|
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
|
||||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
|
||||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
|
||||||
cntx
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
config/zen3/bli_kernel_defs_zen3.h
Normal file
52
config/zen3/bli_kernel_defs_zen3.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2022, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//#ifndef BLIS_KERNEL_DEFS_H
|
||||||
|
//#define BLIS_KERNEL_DEFS_H
|
||||||
|
|
||||||
|
|
||||||
|
// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
|
||||||
|
|
||||||
|
#define BLIS_MR_s 6
|
||||||
|
#define BLIS_MR_d 6
|
||||||
|
#define BLIS_MR_c 3
|
||||||
|
#define BLIS_MR_z 3
|
||||||
|
|
||||||
|
#define BLIS_NR_s 16
|
||||||
|
#define BLIS_NR_d 8
|
||||||
|
#define BLIS_NR_c 8
|
||||||
|
#define BLIS_NR_z 4
|
||||||
|
|
||||||
|
//#endif
|
||||||
|
|
||||||
@@ -127,7 +127,7 @@ void bli_cntx_init_fooarch( cntx_t* cntx )
|
|||||||
// blocksizes (and multiples) for native execution.
|
// blocksizes (and multiples) for native execution.
|
||||||
bli_cntx_set_blkszs
|
bli_cntx_set_blkszs
|
||||||
(
|
(
|
||||||
BLIS_NAT, 5,
|
5,
|
||||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||||
|
|||||||
@@ -61,15 +61,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjx, \
|
conjx, \
|
||||||
n, \
|
n, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -98,14 +98,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
n, \
|
n, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
index, \
|
index, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -135,17 +135,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjx, \
|
conjx, \
|
||||||
n, \
|
n, \
|
||||||
alpha, \
|
alpha, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
beta, \
|
beta, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -175,16 +175,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
if ( cntx == NULL ) \
|
if ( cntx == NULL ) \
|
||||||
cntx = bli_gks_query_cntx(); \
|
cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjx, \
|
conjx, \
|
||||||
n, \
|
n, \
|
||||||
alpha, \
|
alpha, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -215,17 +215,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjx, \
|
conjx, \
|
||||||
conjy, \
|
conjy, \
|
||||||
n, \
|
n, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
rho, \
|
rho, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -257,19 +257,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjx, \
|
conjx, \
|
||||||
conjy, \
|
conjy, \
|
||||||
n, \
|
n, \
|
||||||
alpha, \
|
alpha, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
beta, \
|
beta, \
|
||||||
rho, \
|
rho, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -295,13 +295,13 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
n, \
|
n, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -329,15 +329,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjalpha, \
|
conjalpha, \
|
||||||
n, \
|
n, \
|
||||||
alpha, \
|
alpha, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -365,14 +365,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
n, \
|
n, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -400,16 +400,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjx, \
|
conjx, \
|
||||||
n, \
|
n, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
beta, \
|
beta, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
{ \
|
{ \
|
||||||
const num_t dt = PASTEMAC(ch,type); \
|
const num_t dt = PASTEMAC(ch,type); \
|
||||||
\
|
\
|
||||||
PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
|
PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
copyv_p \
|
copyv_p \
|
||||||
( \
|
( \
|
||||||
|
|||||||
@@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
{ \
|
{ \
|
||||||
const num_t dt = PASTEMAC(ch,type); \
|
const num_t dt = PASTEMAC(ch,type); \
|
||||||
\
|
\
|
||||||
PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
|
PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
copyv_p \
|
copyv_p \
|
||||||
( \
|
( \
|
||||||
|
|||||||
@@ -85,32 +85,33 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
\
|
\
|
||||||
if ( bli_is_nonunit_diag( diagx ) ) \
|
if ( bli_is_nonunit_diag( diagx ) ) \
|
||||||
{ \
|
{ \
|
||||||
x1 = x + offx; \
|
x1 = x + offx; \
|
||||||
y1 = y + offy; \
|
y1 = y + offy; \
|
||||||
} \
|
} \
|
||||||
else /* if ( bli_is_unit_diag( diagx ) ) */ \
|
else /* if ( bli_is_unit_diag( diagx ) ) */ \
|
||||||
{ \
|
{ \
|
||||||
/* Simulate a unit diagonal for x with a zero increment over a unit
|
/* Simulate a unit diagonal for x with a zero increment over a unit
|
||||||
scalar. */ \
|
scalar. */ \
|
||||||
x1 = PASTEMAC(ch,1); \
|
x1 = PASTEMAC(ch,1); \
|
||||||
incx = 0; \
|
incx = 0; \
|
||||||
y1 = y + offy; \
|
y1 = y + offy; \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
/* Query the context for the operation's kernel address. */ \
|
/* Query the context for the operation's kernel address. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Invoke the kernel with the appropriate parameters. */ \
|
/* Invoke the kernel with the appropriate parameters. */ \
|
||||||
f( \
|
f \
|
||||||
conjx, \
|
( \
|
||||||
n_elem, \
|
conjx, \
|
||||||
x1, incx, \
|
n_elem, \
|
||||||
y1, incy, \
|
x1, incx, \
|
||||||
cntx \
|
y1, incy, \
|
||||||
); \
|
cntx \
|
||||||
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
INSERT_GENTFUNC_BASIC2( addd, addv, BLIS_ADDV_KER )
|
INSERT_GENTFUNC_BASIC2( addd, addv, BLIS_ADDV_KER )
|
||||||
@@ -164,33 +165,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
\
|
\
|
||||||
if ( bli_is_nonunit_diag( diagx ) ) \
|
if ( bli_is_nonunit_diag( diagx ) ) \
|
||||||
{ \
|
{ \
|
||||||
x1 = x + offx; \
|
x1 = x + offx; \
|
||||||
y1 = y + offy; \
|
y1 = y + offy; \
|
||||||
} \
|
} \
|
||||||
else /* if ( bli_is_unit_diag( diagx ) ) */ \
|
else /* if ( bli_is_unit_diag( diagx ) ) */ \
|
||||||
{ \
|
{ \
|
||||||
/* Simulate a unit diagonal for x with a zero increment over a unit
|
/* Simulate a unit diagonal for x with a zero increment over a unit
|
||||||
scalar. */ \
|
scalar. */ \
|
||||||
x1 = PASTEMAC(ch,1); \
|
x1 = PASTEMAC(ch,1); \
|
||||||
incx = 0; \
|
incx = 0; \
|
||||||
y1 = y + offy; \
|
y1 = y + offy; \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
/* Query the context for the operation's kernel address. */ \
|
/* Query the context for the operation's kernel address. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Invoke the kernel with the appropriate parameters. */ \
|
/* Invoke the kernel with the appropriate parameters. */ \
|
||||||
f( \
|
f \
|
||||||
conjx, \
|
( \
|
||||||
n_elem, \
|
conjx, \
|
||||||
alpha, \
|
n_elem, \
|
||||||
x1, incx, \
|
alpha, \
|
||||||
y1, incy, \
|
x1, incx, \
|
||||||
cntx \
|
y1, incy, \
|
||||||
); \
|
cntx \
|
||||||
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
INSERT_GENTFUNC_BASIC2( axpyd, axpyv, BLIS_AXPYV_KER )
|
INSERT_GENTFUNC_BASIC2( axpyd, axpyv, BLIS_AXPYV_KER )
|
||||||
@@ -233,20 +235,21 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
&offx, &n_elem, &incx \
|
&offx, &n_elem, &incx \
|
||||||
); \
|
); \
|
||||||
\
|
\
|
||||||
x1 = x + offx; \
|
x1 = x + offx; \
|
||||||
\
|
\
|
||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
/* Query the context for the operation's kernel address. */ \
|
/* Query the context for the operation's kernel address. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Invoke the kernel with the appropriate parameters. */ \
|
/* Invoke the kernel with the appropriate parameters. */ \
|
||||||
f( \
|
f \
|
||||||
n_elem, \
|
( \
|
||||||
x1, incx, \
|
n_elem, \
|
||||||
cntx \
|
x1, incx, \
|
||||||
); \
|
cntx \
|
||||||
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER )
|
INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER )
|
||||||
@@ -290,22 +293,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
&offx, &n_elem, &incx \
|
&offx, &n_elem, &incx \
|
||||||
); \
|
); \
|
||||||
\
|
\
|
||||||
x1 = x + offx; \
|
x1 = x + offx; \
|
||||||
\
|
\
|
||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
/* Query the context for the operation's kernel address. */ \
|
/* Query the context for the operation's kernel address. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Invoke the kernel with the appropriate parameters. */ \
|
/* Invoke the kernel with the appropriate parameters. */ \
|
||||||
f( \
|
f \
|
||||||
conjalpha, \
|
( \
|
||||||
n_elem, \
|
conjalpha, \
|
||||||
alpha, \
|
n_elem, \
|
||||||
x1, incx, \
|
alpha, \
|
||||||
cntx \
|
x1, incx, \
|
||||||
); \
|
cntx \
|
||||||
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER )
|
INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER )
|
||||||
@@ -361,27 +365,28 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
PASTEMAC(ch,setis)( *alpha, *chi11 ); \
|
PASTEMAC(ch,setis)( *alpha, *chi11 ); \
|
||||||
} */ \
|
} */ \
|
||||||
\
|
\
|
||||||
/* Acquire the addres of the imaginary component of the first element,
|
/* Acquire the address of the imaginary component of the first element,
|
||||||
and scale the increment for use in the real domain. Note that the
|
and scale the increment for use in the real domain. Note that the
|
||||||
indexing into the imaginary field only needs to work for complex
|
indexing into the imaginary field only needs to work for complex
|
||||||
datatypes since we return early for real domain types. */ \
|
datatypes since we return early for real domain types. */ \
|
||||||
x1 = ( ctype_r* )( x + offx ) + 1; \
|
x1 = ( ctype_r* )( x + offx ) + 1; \
|
||||||
incx = 2*incx; \
|
incx = 2*incx; \
|
||||||
\
|
\
|
||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
/* Query the context for the operation's kernel address. */ \
|
/* Query the context for the operation's kernel address. */ \
|
||||||
PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx ); \
|
PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Invoke the kernel with the appropriate parameters. */ \
|
/* Invoke the kernel with the appropriate parameters. */ \
|
||||||
f( \
|
f \
|
||||||
BLIS_NO_CONJUGATE, \
|
( \
|
||||||
n_elem, \
|
BLIS_NO_CONJUGATE, \
|
||||||
alpha, \
|
n_elem, \
|
||||||
x1, incx, \
|
alpha, \
|
||||||
cntx \
|
x1, incx, \
|
||||||
); \
|
cntx \
|
||||||
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER )
|
INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER )
|
||||||
@@ -424,22 +429,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
&offx, &n_elem, &incx \
|
&offx, &n_elem, &incx \
|
||||||
); \
|
); \
|
||||||
\
|
\
|
||||||
x1 = x + offx; \
|
x1 = x + offx; \
|
||||||
\
|
\
|
||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
/* Query the context for the operation's kernel address. */ \
|
/* Query the context for the operation's kernel address. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Invoke the kernel with the appropriate parameters. */ \
|
/* Invoke the kernel with the appropriate parameters. */ \
|
||||||
f( \
|
f \
|
||||||
BLIS_NO_CONJUGATE, \
|
( \
|
||||||
n_elem, \
|
BLIS_NO_CONJUGATE, \
|
||||||
alpha, 0, \
|
n_elem, \
|
||||||
x1, incx, \
|
alpha, 0, \
|
||||||
cntx \
|
x1, incx, \
|
||||||
); \
|
cntx \
|
||||||
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER )
|
INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER )
|
||||||
@@ -491,33 +497,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
\
|
\
|
||||||
if ( bli_is_nonunit_diag( diagx ) ) \
|
if ( bli_is_nonunit_diag( diagx ) ) \
|
||||||
{ \
|
{ \
|
||||||
x1 = x + offx; \
|
x1 = x + offx; \
|
||||||
y1 = y + offy; \
|
y1 = y + offy; \
|
||||||
} \
|
} \
|
||||||
else /* if ( bli_is_unit_diag( diagx ) ) */ \
|
else /* if ( bli_is_unit_diag( diagx ) ) */ \
|
||||||
{ \
|
{ \
|
||||||
/* Simulate a unit diagonal for x with a zero increment over a unit
|
/* Simulate a unit diagonal for x with a zero increment over a unit
|
||||||
scalar. */ \
|
scalar. */ \
|
||||||
x1 = PASTEMAC(ch,1); \
|
x1 = PASTEMAC(ch,1); \
|
||||||
incx = 0; \
|
incx = 0; \
|
||||||
y1 = y + offy; \
|
y1 = y + offy; \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
/* Query the context for the operation's kernel address. */ \
|
/* Query the context for the operation's kernel address. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Invoke the kernel with the appropriate parameters. */ \
|
/* Invoke the kernel with the appropriate parameters. */ \
|
||||||
f( \
|
f \
|
||||||
conjx, \
|
( \
|
||||||
n_elem, \
|
conjx, \
|
||||||
x1, incx, \
|
n_elem, \
|
||||||
beta, \
|
x1, incx, \
|
||||||
y1, incy, \
|
beta, \
|
||||||
cntx \
|
y1, incy, \
|
||||||
); \
|
cntx \
|
||||||
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
INSERT_GENTFUNC_BASIC2( xpbyd, xpbyv, BLIS_XPBYV_KER )
|
INSERT_GENTFUNC_BASIC2( xpbyd, xpbyv, BLIS_XPBYV_KER )
|
||||||
|
|||||||
@@ -65,19 +65,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjx, \
|
conjx, \
|
||||||
conjy, \
|
conjy, \
|
||||||
n, \
|
n, \
|
||||||
alphax, \
|
alphax, \
|
||||||
alphay, \
|
alphay, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
z, incz, \
|
z, incz, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -109,19 +109,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conja, \
|
conja, \
|
||||||
conjx, \
|
conjx, \
|
||||||
m, \
|
m, \
|
||||||
b_n, \
|
b_n, \
|
||||||
alpha, \
|
alpha, \
|
||||||
a, inca, lda, \
|
a, inca, lda, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -154,20 +154,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjxt, \
|
conjxt, \
|
||||||
conjx, \
|
conjx, \
|
||||||
conjy, \
|
conjy, \
|
||||||
n, \
|
n, \
|
||||||
alpha, \
|
alpha, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
rho, \
|
rho, \
|
||||||
z, incz, \
|
z, incz, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -204,24 +204,24 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjat, \
|
conjat, \
|
||||||
conja, \
|
conja, \
|
||||||
conjw, \
|
conjw, \
|
||||||
conjx, \
|
conjx, \
|
||||||
m, \
|
m, \
|
||||||
b_n, \
|
b_n, \
|
||||||
alpha, \
|
alpha, \
|
||||||
a, inca, lda, \
|
a, inca, lda, \
|
||||||
w, incw, \
|
w, incw, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
beta, \
|
beta, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
z, incz, \
|
z, incz, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -254,20 +254,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
|||||||
/* Obtain a valid context from the gks if necessary. */ \
|
/* Obtain a valid context from the gks if necessary. */ \
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
conjat, \
|
conjat, \
|
||||||
conjx, \
|
conjx, \
|
||||||
m, \
|
m, \
|
||||||
b_n, \
|
b_n, \
|
||||||
alpha, \
|
alpha, \
|
||||||
a, inca, lda, \
|
a, inca, lda, \
|
||||||
x, incx, \
|
x, incx, \
|
||||||
beta, \
|
beta, \
|
||||||
y, incy, \
|
y, incy, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -102,35 +102,40 @@ INSERT_GENTDEF( packm_cxk )
|
|||||||
\
|
\
|
||||||
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
|
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
|
||||||
( \
|
( \
|
||||||
conj_t conjp, \
|
conj_t conja, \
|
||||||
|
pack_t schema, \
|
||||||
|
dim_t cdim, \
|
||||||
dim_t n, \
|
dim_t n, \
|
||||||
ctype* restrict kappa, \
|
ctype* restrict kappa, \
|
||||||
ctype* restrict p, inc_t ldp, \
|
ctype* restrict p, inc_t ldp, \
|
||||||
ctype* restrict a, inc_t inca, inc_t lda, \
|
ctype* restrict a, inc_t inca, inc_t lda, \
|
||||||
cntx_t* restrict cntx \
|
cntx_t* restrict cntx \
|
||||||
);
|
);
|
||||||
|
|
||||||
INSERT_GENTDEF( unpackm_cxk )
|
INSERT_GENTDEF( unpackm_cxk )
|
||||||
|
|
||||||
// packm_1er_ker
|
// packm_diag_ker
|
||||||
|
|
||||||
#undef GENTDEF
|
#undef GENTDEF
|
||||||
#define GENTDEF( ctype, ch, opname, tsuf ) \
|
#define GENTDEF( ctype, ch, opname, tsuf ) \
|
||||||
\
|
\
|
||||||
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
|
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
|
||||||
( \
|
( \
|
||||||
|
struc_t struca, \
|
||||||
|
diag_t diaga, \
|
||||||
|
uplo_t uploa, \
|
||||||
conj_t conja, \
|
conj_t conja, \
|
||||||
pack_t schema, \
|
pack_t schema, \
|
||||||
|
bool invdiag, \
|
||||||
dim_t cdim, \
|
dim_t cdim, \
|
||||||
dim_t n, \
|
|
||||||
dim_t n_max, \
|
dim_t n_max, \
|
||||||
ctype* restrict kappa, \
|
ctype* restrict kappa, \
|
||||||
ctype* restrict a, inc_t inca, inc_t lda, \
|
ctype* restrict a, inc_t inca, inc_t lda, \
|
||||||
ctype* restrict p, inc_t ldp, \
|
ctype* restrict p, inc_t ldp, \
|
||||||
cntx_t* restrict cntx \
|
cntx_t* restrict cntx \
|
||||||
);
|
);
|
||||||
|
|
||||||
INSERT_GENTDEF( packm_cxk_1er )
|
INSERT_GENTDEF( packm_cxc_diag )
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -47,16 +47,8 @@
|
|||||||
#undef GENTPROT
|
#undef GENTPROT
|
||||||
#define GENTPROT PACKM_KER_PROT
|
#define GENTPROT PACKM_KER_PROT
|
||||||
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_2xk_ker_name )
|
INSERT_GENTPROT_BASIC0( packm_mrxk_ker_name )
|
||||||
INSERT_GENTPROT_BASIC0( packm_3xk_ker_name )
|
INSERT_GENTPROT_BASIC0( packm_nrxk_ker_name )
|
||||||
INSERT_GENTPROT_BASIC0( packm_4xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_6xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_8xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_10xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_12xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_14xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_16xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_24xk_ker_name )
|
|
||||||
|
|
||||||
|
|
||||||
// native unpackm kernels
|
// native unpackm kernels
|
||||||
@@ -64,27 +56,33 @@ INSERT_GENTPROT_BASIC0( packm_24xk_ker_name )
|
|||||||
#undef GENTPROT
|
#undef GENTPROT
|
||||||
#define GENTPROT UNPACKM_KER_PROT
|
#define GENTPROT UNPACKM_KER_PROT
|
||||||
|
|
||||||
INSERT_GENTPROT_BASIC0( unpackm_2xk_ker_name )
|
INSERT_GENTPROT_BASIC0( unpackm_mrxk_ker_name )
|
||||||
INSERT_GENTPROT_BASIC0( unpackm_4xk_ker_name )
|
INSERT_GENTPROT_BASIC0( unpackm_nrxk_ker_name )
|
||||||
INSERT_GENTPROT_BASIC0( unpackm_6xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( unpackm_8xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( unpackm_10xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( unpackm_12xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( unpackm_14xk_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name )
|
|
||||||
|
|
||||||
|
|
||||||
// 1e/1r packm kernels
|
// 1e/1r packm kernels
|
||||||
|
|
||||||
#undef GENTPROT
|
#undef GENTPROT
|
||||||
#define GENTPROT PACKM_1ER_KER_PROT
|
#define GENTPROT PACKM_KER_PROT
|
||||||
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_2xk_1er_ker_name )
|
INSERT_GENTPROT_BASIC0( packm_mrxk_1er_ker_name )
|
||||||
INSERT_GENTPROT_BASIC0( packm_4xk_1er_ker_name )
|
INSERT_GENTPROT_BASIC0( packm_nrxk_1er_ker_name )
|
||||||
INSERT_GENTPROT_BASIC0( packm_6xk_1er_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_8xk_1er_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_10xk_1er_ker_name )
|
// packm kernels for diagonal blocks
|
||||||
INSERT_GENTPROT_BASIC0( packm_12xk_1er_ker_name )
|
|
||||||
INSERT_GENTPROT_BASIC0( packm_14xk_1er_ker_name )
|
#undef GENTPROT
|
||||||
INSERT_GENTPROT_BASIC0( packm_16xk_1er_ker_name )
|
#define GENTPROT PACKM_DIAG_KER_PROT
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_ker_name )
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_ker_name )
|
||||||
|
|
||||||
|
|
||||||
|
// 1e/1r packm kernels for diagonal blocks
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT PACKM_DIAG_KER_PROT
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_1er_ker_name )
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_1er_ker_name )
|
||||||
|
|
||||||
|
|||||||
@@ -37,7 +37,7 @@
|
|||||||
// Define template prototypes for level-1m kernels.
|
// Define template prototypes for level-1m kernels.
|
||||||
//
|
//
|
||||||
|
|
||||||
// native packm kernels
|
// packm kernels
|
||||||
|
|
||||||
#define PACKM_KER_PROT( ctype, ch, varname ) \
|
#define PACKM_KER_PROT( ctype, ch, varname ) \
|
||||||
\
|
\
|
||||||
@@ -55,35 +55,40 @@ void PASTEMAC(ch,varname) \
|
|||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
// native unpackm kernels
|
// unpackm kernels
|
||||||
|
|
||||||
#define UNPACKM_KER_PROT( ctype, ch, varname ) \
|
#define UNPACKM_KER_PROT( ctype, ch, varname ) \
|
||||||
\
|
\
|
||||||
void PASTEMAC(ch,varname) \
|
|
||||||
( \
|
|
||||||
conj_t conja, \
|
|
||||||
dim_t n, \
|
|
||||||
ctype* restrict kappa, \
|
|
||||||
ctype* restrict p, inc_t ldp, \
|
|
||||||
ctype* restrict a, inc_t inca, inc_t lda, \
|
|
||||||
cntx_t* restrict cntx \
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
// 1e/1r packm kernels
|
|
||||||
|
|
||||||
#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
|
|
||||||
\
|
|
||||||
void PASTEMAC(ch,varname) \
|
void PASTEMAC(ch,varname) \
|
||||||
( \
|
( \
|
||||||
conj_t conja, \
|
conj_t conja, \
|
||||||
pack_t schema, \
|
pack_t schema, \
|
||||||
dim_t cdim, \
|
dim_t cdim, \
|
||||||
dim_t n, \
|
dim_t n, \
|
||||||
|
ctype* restrict kappa, \
|
||||||
|
ctype* restrict p, inc_t ldp, \
|
||||||
|
ctype* restrict a, inc_t inca, inc_t lda, \
|
||||||
|
cntx_t* restrict cntx \
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
// packm kernels for diagonal blocks
|
||||||
|
|
||||||
|
#define PACKM_DIAG_KER_PROT( ctype, ch, varname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,varname) \
|
||||||
|
( \
|
||||||
|
struc_t struca, \
|
||||||
|
diag_t diaga, \
|
||||||
|
uplo_t uploa, \
|
||||||
|
conj_t conja, \
|
||||||
|
pack_t schema, \
|
||||||
|
bool invdiag, \
|
||||||
|
dim_t cdim, \
|
||||||
dim_t n_max, \
|
dim_t n_max, \
|
||||||
ctype* restrict kappa, \
|
ctype* restrict kappa, \
|
||||||
ctype* restrict a, inc_t inca, inc_t lda, \
|
ctype* restrict a, inc_t inca, inc_t lda, \
|
||||||
ctype* restrict p, inc_t ldp, \
|
ctype* restrict p, inc_t ldp, \
|
||||||
cntx_t* restrict cntx \
|
cntx_t* restrict cntx \
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ void PASTEMAC(ch,opname) \
|
|||||||
conjx = bli_extract_conj( transx ); \
|
conjx = bli_extract_conj( transx ); \
|
||||||
\
|
\
|
||||||
/* Query the kernel needed for this operation. */ \
|
/* Query the kernel needed for this operation. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Handle dense and upper/lower storage cases separately. */ \
|
/* Handle dense and upper/lower storage cases separately. */ \
|
||||||
if ( bli_is_dense( uplox_eff ) ) \
|
if ( bli_is_dense( uplox_eff ) ) \
|
||||||
@@ -197,7 +197,7 @@ void PASTEMAC(ch,opname) \
|
|||||||
conjx = bli_extract_conj( transx ); \
|
conjx = bli_extract_conj( transx ); \
|
||||||
\
|
\
|
||||||
/* Query the kernel needed for this operation. */ \
|
/* Query the kernel needed for this operation. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Handle dense and upper/lower storage cases separately. */ \
|
/* Handle dense and upper/lower storage cases separately. */ \
|
||||||
if ( bli_is_dense( uplox_eff ) ) \
|
if ( bli_is_dense( uplox_eff ) ) \
|
||||||
@@ -310,7 +310,7 @@ void PASTEMAC(ch,opname) \
|
|||||||
if ( bli_is_zeros( uplox_eff ) ) return; \
|
if ( bli_is_zeros( uplox_eff ) ) return; \
|
||||||
\
|
\
|
||||||
/* Query the kernel needed for this operation. */ \
|
/* Query the kernel needed for this operation. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Handle dense and upper/lower storage cases separately. */ \
|
/* Handle dense and upper/lower storage cases separately. */ \
|
||||||
if ( bli_is_dense( uplox_eff ) ) \
|
if ( bli_is_dense( uplox_eff ) ) \
|
||||||
@@ -423,7 +423,7 @@ void PASTEMAC(ch,opname) \
|
|||||||
conjx = bli_extract_conj( transx ); \
|
conjx = bli_extract_conj( transx ); \
|
||||||
\
|
\
|
||||||
/* Query the kernel needed for this operation. */ \
|
/* Query the kernel needed for this operation. */ \
|
||||||
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
|
PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \
|
||||||
\
|
\
|
||||||
/* Handle dense and upper/lower storage cases separately. */ \
|
/* Handle dense and upper/lower storage cases separately. */ \
|
||||||
if ( bli_is_dense( uplox_eff ) ) \
|
if ( bli_is_dense( uplox_eff ) ) \
|
||||||
|
|||||||
@@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \
|
|||||||
/* Note that we use panel_dim_max, not panel_dim, to query the packm
|
/* Note that we use panel_dim_max, not panel_dim, to query the packm
|
||||||
kernel function pointer. This means that we always use the same
|
kernel function pointer. This means that we always use the same
|
||||||
kernel, even for edge cases. */ \
|
kernel, even for edge cases. */ \
|
||||||
num_t dt = PASTEMAC(ch,type); \
|
num_t dt = PASTEMAC(ch,type); \
|
||||||
l1mkr_t ker_id = panel_dim_max; \
|
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
|
||||||
|
: BLIS_PACKM_MRXK_KER; \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f; \
|
PASTECH2(ch,opname,_ker_ft) f; \
|
||||||
\
|
\
|
||||||
/* Query the context for the packm kernel corresponding to the current
|
/* Query the context for the packm kernel corresponding to the current
|
||||||
panel dimension, or kernel id. If the id is invalid, the function will
|
panel dimension, or kernel id. If the id is invalid, the function will
|
||||||
return NULL. */ \
|
return NULL. */ \
|
||||||
f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
|
f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
|
||||||
\
|
\
|
||||||
/* If there exists a kernel implementation for the micro-panel dimension
|
/* If there exists a kernel implementation for the micro-panel dimension
|
||||||
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
||||||
@@ -91,30 +92,30 @@ void PASTEMAC(ch,opname) \
|
|||||||
that happens, the packm kernel must have set the 0's added in
|
that happens, the packm kernel must have set the 0's added in
|
||||||
step (3) below.
|
step (3) below.
|
||||||
|
|
||||||
packm kernel packm kernel packm kernel packm_tri_cxk
|
packm kernel packm kernel packm kernel packm_tri_cxk
|
||||||
step 1: step 2: step 3: step 4:
|
step 1: step 2: step 3: step 4:
|
||||||
|
|
||||||
x x x x . . x x x x . . x x x x 0 0 x x x x 0 0
|
x x x x . . x x x x . . x x x x 0 0 x x x x 0 0
|
||||||
? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0
|
? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0
|
||||||
? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0
|
? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0
|
||||||
? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0
|
? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0
|
||||||
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
|
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
|
||||||
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
|
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
|
||||||
|
|
||||||
x Copied from A; valid element.
|
x Copied from A; valid element.
|
||||||
? Copied from A, but value is unknown and unused.
|
? Copied from A, but value is unknown and unused.
|
||||||
. Uninitialized.
|
. Uninitialized.
|
||||||
0 Initialized to zero.
|
0 Initialized to zero.
|
||||||
1 Initialized to one.
|
1 Initialized to one.
|
||||||
|
|
||||||
NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s
|
NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s
|
||||||
to zero. This is not needed to support trsm, but rather to
|
to zero. This is not needed to support trsm, but rather to
|
||||||
support trmm. (Both use the same packing format and code.)
|
support trmm. (Both use the same packing format and code.)
|
||||||
|
|
||||||
In this case, panel_dim will be 4 because four rows of data are
|
In this case, panel_dim will be 4 because four rows of data are
|
||||||
copied from A, panel_len will be 4 because those four rows span
|
copied from A, panel_len will be 4 because those four rows span
|
||||||
four columns of A, and panel_len_max will be 6 because there are a
|
four columns of A, and panel_len_max will be 6 because there are a
|
||||||
total of 6 columns that can be written to in the packed micropanel,
|
total of 6 columns that can be written to in the packed micropanel,
|
||||||
2 of which lie beyond the values copied from A. */ \
|
2 of which lie beyond the values copied from A. */ \
|
||||||
f \
|
f \
|
||||||
( \
|
( \
|
||||||
@@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \
|
|||||||
/* Note that we use panel_dim_max, not panel_dim, to query the packm
|
/* Note that we use panel_dim_max, not panel_dim, to query the packm
|
||||||
kernel function pointer. This means that we always use the same
|
kernel function pointer. This means that we always use the same
|
||||||
kernel, even for edge cases. */ \
|
kernel, even for edge cases. */ \
|
||||||
num_t dt = PASTEMAC(ch,type); \
|
num_t dt = PASTEMAC(ch,type); \
|
||||||
l1mkr_t ker_id = panel_dim_max; \
|
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \
|
||||||
|
: BLIS_PACKM_MRXK_1ER_KER; \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f; \
|
PASTECH2(ch,opname,_ker_ft) f; \
|
||||||
\
|
\
|
||||||
/* Query the context for the packm kernel corresponding to the current
|
/* Query the context for the packm kernel corresponding to the current
|
||||||
panel dimension, or kernel id. If the id is invalid, the function will
|
panel dimension, or kernel id. If the id is invalid, the function will
|
||||||
return NULL. */ \
|
return NULL. */ \
|
||||||
f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
|
f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
|
||||||
\
|
\
|
||||||
/* If there exists a kernel implementation for the micro-panel dimension
|
/* If there exists a kernel implementation for the micro-panel dimension
|
||||||
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
||||||
@@ -40,6 +40,7 @@
|
|||||||
void PASTEMAC(ch,opname) \
|
void PASTEMAC(ch,opname) \
|
||||||
( \
|
( \
|
||||||
conj_t conjp, \
|
conj_t conjp, \
|
||||||
|
pack_t schema, \
|
||||||
dim_t panel_dim, \
|
dim_t panel_dim, \
|
||||||
dim_t panel_len, \
|
dim_t panel_len, \
|
||||||
ctype* kappa, \
|
ctype* kappa, \
|
||||||
@@ -48,15 +49,16 @@ void PASTEMAC(ch,opname) \
|
|||||||
cntx_t* cntx \
|
cntx_t* cntx \
|
||||||
) \
|
) \
|
||||||
{ \
|
{ \
|
||||||
num_t dt = PASTEMAC(ch,type); \
|
num_t dt = PASTEMAC(ch,type); \
|
||||||
l1mkr_t ker_id = panel_dim; \
|
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \
|
||||||
|
: BLIS_UNPACKM_MRXK_KER; \
|
||||||
\
|
\
|
||||||
PASTECH2(ch,opname,_ker_ft) f; \
|
PASTECH2(ch,opname,_ker_ft) f; \
|
||||||
\
|
\
|
||||||
/* Query the context for the unpackm kernel corresponding to the current
|
/* Query the context for the unpackm kernel corresponding to the current
|
||||||
panel dimension, or kernel id. If the id is invalid, the function will
|
panel dimension, or kernel id. If the id is invalid, the function will
|
||||||
return NULL. */ \
|
return NULL. */ \
|
||||||
f = bli_cntx_get_unpackm_ker_dt( dt, ker_id, cntx ); \
|
f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
|
||||||
\
|
\
|
||||||
/* If there exists a kernel implementation for the micro-panel dimension
|
/* If there exists a kernel implementation for the micro-panel dimension
|
||||||
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
||||||
@@ -39,6 +39,7 @@
|
|||||||
void PASTEMAC(ch,varname) \
|
void PASTEMAC(ch,varname) \
|
||||||
( \
|
( \
|
||||||
conj_t conjp, \
|
conj_t conjp, \
|
||||||
|
pack_t schema, \
|
||||||
dim_t panel_dim, \
|
dim_t panel_dim, \
|
||||||
dim_t panel_len, \
|
dim_t panel_len, \
|
||||||
ctype* kappa, \
|
ctype* kappa, \
|
||||||
@@ -43,10 +43,6 @@
|
|||||||
#include "bli_packm_part.h"
|
#include "bli_packm_part.h"
|
||||||
|
|
||||||
#include "bli_packm_struc_cxk.h"
|
#include "bli_packm_struc_cxk.h"
|
||||||
#include "bli_packm_struc_cxk_1er.h"
|
|
||||||
|
|
||||||
#include "bli_packm_cxk.h"
|
|
||||||
#include "bli_packm_cxk_1er.h"
|
|
||||||
|
|
||||||
// Mixed datatype support.
|
// Mixed datatype support.
|
||||||
#ifdef BLIS_ENABLE_GEMM_MD
|
#ifdef BLIS_ENABLE_GEMM_MD
|
||||||
|
|||||||
@@ -43,11 +43,11 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
|
|||||||
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
|
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
|
||||||
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
|
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
|
||||||
// 0001 row/col panels: 1m-expanded (1e)
|
// 0001 row/col panels: 1m-expanded (1e)
|
||||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
{ { NULL, bli_cpackm_struc_cxk,
|
||||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
NULL, bli_zpackm_struc_cxk, } },
|
||||||
// 0010 row/col panels: 1m-reordered (1r)
|
// 0010 row/col panels: 1m-reordered (1r)
|
||||||
{ { NULL, bli_cpackm_struc_cxk_1er,
|
{ { NULL, bli_cpackm_struc_cxk,
|
||||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
NULL, bli_zpackm_struc_cxk, } },
|
||||||
};
|
};
|
||||||
|
|
||||||
static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
|
static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
|
||||||
|
|||||||
@@ -34,8 +34,8 @@
|
|||||||
|
|
||||||
#include "blis.h"
|
#include "blis.h"
|
||||||
|
|
||||||
#undef GENTFUNC
|
#undef GENTFUNCR
|
||||||
#define GENTFUNC( ctype, ch, varname, kername ) \
|
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, cxk_kername, cxc_kername ) \
|
||||||
\
|
\
|
||||||
void PASTEMAC(ch,varname) \
|
void PASTEMAC(ch,varname) \
|
||||||
( \
|
( \
|
||||||
@@ -58,460 +58,249 @@ void PASTEMAC(ch,varname) \
|
|||||||
cntx_t* cntx \
|
cntx_t* cntx \
|
||||||
) \
|
) \
|
||||||
{ \
|
{ \
|
||||||
/* Handle micro-panel packing based on the structure of the matrix
|
num_t dt = PASTEMAC(ch,type); \
|
||||||
being packed. */ \
|
num_t dt_r = PASTEMAC(chr,type); \
|
||||||
if ( bli_is_general( strucc ) ) \
|
dim_t panel_len_pad = panel_len_max - panel_len; \
|
||||||
{ \
|
|
||||||
/* For micro-panels of general matrices, we can call the pack
|
|
||||||
kernel front-end directly. */ \
|
|
||||||
PASTEMAC(ch,kername) \
|
|
||||||
( \
|
|
||||||
conjc, \
|
|
||||||
schema, \
|
|
||||||
panel_dim, \
|
|
||||||
panel_dim_max, \
|
|
||||||
panel_len, \
|
|
||||||
panel_len_max, \
|
|
||||||
kappa, \
|
|
||||||
c, incc, ldc, \
|
|
||||||
p, ldp, \
|
|
||||||
cntx \
|
|
||||||
); \
|
|
||||||
} \
|
|
||||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
|
||||||
{ \
|
|
||||||
/* Call a helper function for micro-panels of Hermitian/symmetric
|
|
||||||
matrices. */ \
|
|
||||||
PASTEMAC(ch,packm_herm_cxk) \
|
|
||||||
( \
|
|
||||||
strucc, \
|
|
||||||
diagc, \
|
|
||||||
uploc, \
|
|
||||||
conjc, \
|
|
||||||
schema, \
|
|
||||||
invdiag, \
|
|
||||||
panel_dim, \
|
|
||||||
panel_len, \
|
|
||||||
panel_dim_max, \
|
|
||||||
panel_len_max, \
|
|
||||||
panel_dim_off, \
|
|
||||||
panel_len_off, \
|
|
||||||
kappa, \
|
|
||||||
c, incc, ldc, \
|
|
||||||
p, ldp, \
|
|
||||||
is_p, \
|
|
||||||
cntx \
|
|
||||||
); \
|
|
||||||
} \
|
|
||||||
else /* ( bli_is_triangular( strucc ) ) */ \
|
|
||||||
{ \
|
|
||||||
/* Call a helper function for micro-panels of triangular
|
|
||||||
matrices. */ \
|
|
||||||
PASTEMAC(ch,packm_tri_cxk) \
|
|
||||||
( \
|
|
||||||
strucc, \
|
|
||||||
diagc, \
|
|
||||||
uploc, \
|
|
||||||
conjc, \
|
|
||||||
schema, \
|
|
||||||
invdiag, \
|
|
||||||
panel_dim, \
|
|
||||||
panel_len, \
|
|
||||||
panel_dim_max, \
|
|
||||||
panel_len_max, \
|
|
||||||
panel_dim_off, \
|
|
||||||
panel_len_off, \
|
|
||||||
kappa, \
|
|
||||||
c, incc, ldc, \
|
|
||||||
p, ldp, \
|
|
||||||
is_p, \
|
|
||||||
cntx \
|
|
||||||
); \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#undef GENTFUNC
|
|
||||||
#define GENTFUNC( ctype, ch, varname, kername ) \
|
|
||||||
\
|
\
|
||||||
void PASTEMAC(ch,varname) \
|
bszid_t bsz_id = bli_is_col_packed( schema ) ? BLIS_NR : BLIS_MR; \
|
||||||
( \
|
dim_t packmrnr = bli_cntx_get_blksz_max_dt( dt, bsz_id, cntx ); \
|
||||||
struc_t strucc, \
|
dim_t packmrnr_r = bli_cntx_get_blksz_max_dt( dt_r, bsz_id, cntx ); \
|
||||||
diag_t diagc, \
|
\
|
||||||
uplo_t uploc, \
|
ukr_t cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \
|
||||||
conj_t conjc, \
|
: BLIS_PACKM_MRXK_KER; \
|
||||||
pack_t schema, \
|
ukr_t cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_KER \
|
||||||
bool invdiag, \
|
: BLIS_PACKM_MRXMR_DIAG_KER; \
|
||||||
dim_t panel_dim, \
|
\
|
||||||
dim_t panel_len, \
|
if ( bli_is_1m_packed( schema ) ) \
|
||||||
dim_t panel_dim_max, \
|
{ \
|
||||||
dim_t panel_len_max, \
|
cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \
|
||||||
dim_t panel_dim_off, \
|
: BLIS_PACKM_MRXK_1ER_KER; \
|
||||||
dim_t panel_len_off, \
|
cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_1ER_KER \
|
||||||
ctype* restrict kappa, \
|
: BLIS_PACKM_MRXMR_DIAG_1ER_KER; \
|
||||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
} \
|
||||||
ctype* restrict p, inc_t ldp, \
|
\
|
||||||
inc_t is_p, \
|
PASTECH2(ch,cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \
|
||||||
cntx_t* cntx \
|
PASTECH2(ch,cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \
|
||||||
) \
|
\
|
||||||
{ \
|
/* For general matrices, pack and return early */ \
|
||||||
|
if ( bli_is_general( strucc ) ) \
|
||||||
|
{ \
|
||||||
|
f_cxk \
|
||||||
|
( \
|
||||||
|
conjc, \
|
||||||
|
schema, \
|
||||||
|
panel_dim, \
|
||||||
|
panel_len, \
|
||||||
|
panel_len_max, \
|
||||||
|
kappa, \
|
||||||
|
c, incc, ldc, \
|
||||||
|
p, ldp, \
|
||||||
|
cntx \
|
||||||
|
); \
|
||||||
|
return; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Sanity check. Diagonals should not intersect the short end of
|
||||||
|
a micro-panel. If they do, then somehow the constraints on
|
||||||
|
cache blocksizes being a whole multiple of the register
|
||||||
|
blocksizes were violated. */ \
|
||||||
doff_t diagoffc = panel_dim_off - panel_len_off; \
|
doff_t diagoffc = panel_dim_off - panel_len_off; \
|
||||||
doff_t diagoffc_abs; \
|
if ( ( -panel_dim < diagoffc && diagoffc < 0 ) || \
|
||||||
dim_t i, j; \
|
( panel_len-panel_dim < diagoffc && diagoffc < panel_len ) ) \
|
||||||
|
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||||
\
|
\
|
||||||
/* Handle the case where the micro-panel does NOT intersect the
|
/* For triangular, symmetric, and hermitian matrices we need to consider
|
||||||
diagonal separately from the case where it does intersect. */ \
|
three parts. */ \
|
||||||
if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
|
\
|
||||||
|
/* Pack to p10. */ \
|
||||||
|
if ( 0 < diagoffc ) \
|
||||||
{ \
|
{ \
|
||||||
/* If the current panel is unstored, we need to make a few
|
dim_t p10_dim = panel_dim; \
|
||||||
adjustments so we refer to the data where it is actually
|
dim_t p10_len = bli_min( diagoffc, panel_len ); \
|
||||||
stored, also taking conjugation into account. (Note this
|
dim_t p10_len_max = p10_len == panel_len ? panel_len_max : p10_len; \
|
||||||
implicitly assumes we are operating on a dense panel
|
ctype* p10 = p; \
|
||||||
within a larger symmetric or Hermitian matrix, since a
|
conj_t conjc10 = conjc; \
|
||||||
general matrix would not contain any unstored region.) */ \
|
ctype* c10 = c; \
|
||||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
|
inc_t incc10 = incc; \
|
||||||
|
inc_t ldc10 = ldc; \
|
||||||
|
\
|
||||||
|
if ( bli_is_upper( uploc ) ) \
|
||||||
{ \
|
{ \
|
||||||
c = c + diagoffc * ( doff_t )ldc + \
|
bli_reflect_to_stored_part( diagoffc, c10, incc10, ldc10 ); \
|
||||||
-diagoffc * ( doff_t )incc; \
|
|
||||||
bli_swap_incs( &incc, &ldc ); \
|
|
||||||
\
|
|
||||||
if ( bli_is_hermitian( strucc ) ) \
|
|
||||||
bli_toggle_conj( &conjc ); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
/* Pack the full panel. */ \
|
|
||||||
PASTEMAC(ch,kername) \
|
|
||||||
( \
|
|
||||||
conjc, \
|
|
||||||
schema, \
|
|
||||||
panel_dim, \
|
|
||||||
panel_dim_max, \
|
|
||||||
panel_len, \
|
|
||||||
panel_len_max, \
|
|
||||||
kappa, \
|
|
||||||
c, incc, ldc, \
|
|
||||||
p, ldp, \
|
|
||||||
cntx \
|
|
||||||
); \
|
|
||||||
} \
|
|
||||||
else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
|
|
||||||
{ \
|
|
||||||
ctype* restrict c10; \
|
|
||||||
ctype* restrict p10; \
|
|
||||||
dim_t p10_dim, p10_len; \
|
|
||||||
inc_t incc10, ldc10; \
|
|
||||||
doff_t diagoffc10; \
|
|
||||||
conj_t conjc10; \
|
|
||||||
\
|
|
||||||
ctype* restrict c12; \
|
|
||||||
ctype* restrict p12; \
|
|
||||||
dim_t p12_dim, p12_len; \
|
|
||||||
inc_t incc12, ldc12; \
|
|
||||||
doff_t diagoffc12; \
|
|
||||||
conj_t conjc12; \
|
|
||||||
\
|
|
||||||
/* Sanity check. Diagonals should not intersect the short end of
|
|
||||||
a micro-panel. If they do, then somehow the constraints on
|
|
||||||
cache blocksizes being a whole multiple of the register
|
|
||||||
blocksizes were violated. */ \
|
|
||||||
if ( diagoffc < 0 ) \
|
|
||||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
|
||||||
\
|
|
||||||
diagoffc_abs = bli_abs( diagoffc ); \
|
|
||||||
\
|
|
||||||
if ( bli_is_lower( uploc ) ) \
|
|
||||||
{ \
|
|
||||||
p10_dim = panel_dim; \
|
|
||||||
p10_len = diagoffc_abs; \
|
|
||||||
p10 = p; \
|
|
||||||
c10 = c; \
|
|
||||||
incc10 = incc; \
|
|
||||||
ldc10 = ldc; \
|
|
||||||
conjc10 = conjc; \
|
|
||||||
\
|
|
||||||
p12_dim = panel_dim; \
|
|
||||||
p12_len = panel_len - p10_len; \
|
|
||||||
j = p10_len; \
|
|
||||||
diagoffc12 = diagoffc_abs - j; \
|
|
||||||
p12 = p + (j )*ldp; \
|
|
||||||
c12 = c + (j )*ldc; \
|
|
||||||
c12 = c12 + diagoffc12 * ( doff_t )ldc + \
|
|
||||||
-diagoffc12 * ( doff_t )incc; \
|
|
||||||
incc12 = ldc; \
|
|
||||||
ldc12 = incc; \
|
|
||||||
conjc12 = conjc; \
|
|
||||||
\
|
|
||||||
if ( bli_is_hermitian( strucc ) ) \
|
|
||||||
bli_toggle_conj( &conjc12 ); \
|
|
||||||
} \
|
|
||||||
else /* if ( bli_is_upper( uploc ) ) */ \
|
|
||||||
{ \
|
|
||||||
p10_dim = panel_dim; \
|
|
||||||
p10_len = diagoffc_abs + panel_dim; \
|
|
||||||
diagoffc10 = diagoffc; \
|
|
||||||
p10 = p; \
|
|
||||||
c10 = c; \
|
|
||||||
c10 = c10 + diagoffc10 * ( doff_t )ldc + \
|
|
||||||
-diagoffc10 * ( doff_t )incc; \
|
|
||||||
incc10 = ldc; \
|
|
||||||
ldc10 = incc; \
|
|
||||||
conjc10 = conjc; \
|
|
||||||
\
|
|
||||||
p12_dim = panel_dim; \
|
|
||||||
p12_len = panel_len - p10_len; \
|
|
||||||
j = p10_len; \
|
|
||||||
p12 = p + (j )*ldp; \
|
|
||||||
c12 = c + (j )*ldc; \
|
|
||||||
incc12 = incc; \
|
|
||||||
ldc12 = ldc; \
|
|
||||||
conjc12 = conjc; \
|
|
||||||
\
|
\
|
||||||
if ( bli_is_hermitian( strucc ) ) \
|
if ( bli_is_hermitian( strucc ) ) \
|
||||||
bli_toggle_conj( &conjc10 ); \
|
bli_toggle_conj( &conjc10 ); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
/* Pack to p10. For upper storage, this includes the unstored
|
/* If we are referencing the unstored part of a triangular matrix,
|
||||||
triangle of c11. */ \
|
explicitly store zeros */ \
|
||||||
/* NOTE: Since we're only packing partial panels here, we pass in
|
if ( bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) \
|
||||||
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
|
|
||||||
fill the columns up to panel_len_max, which is not what we need
|
|
||||||
or want to happen. */ \
|
|
||||||
PASTEMAC(ch,kername) \
|
|
||||||
( \
|
|
||||||
conjc10, \
|
|
||||||
schema, \
|
|
||||||
p10_dim, \
|
|
||||||
panel_dim_max, \
|
|
||||||
p10_len, \
|
|
||||||
p10_len, \
|
|
||||||
kappa, \
|
|
||||||
c10, incc10, ldc10, \
|
|
||||||
p10, ldp, \
|
|
||||||
cntx \
|
|
||||||
); \
|
|
||||||
\
|
|
||||||
/* Pack to p12. For lower storage, this includes the unstored
|
|
||||||
triangle of c11. */ \
|
|
||||||
/* NOTE: Since we're only packing partial panels here, we pass in
|
|
||||||
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
|
|
||||||
fill the columns up to panel_len_max, which is not what we need
|
|
||||||
or want to happen. */ \
|
|
||||||
PASTEMAC(ch,kername) \
|
|
||||||
( \
|
|
||||||
conjc12, \
|
|
||||||
schema, \
|
|
||||||
p12_dim, \
|
|
||||||
panel_dim_max, \
|
|
||||||
p12_len, \
|
|
||||||
p12_len, \
|
|
||||||
kappa, \
|
|
||||||
c12, incc12, ldc12, \
|
|
||||||
p12, ldp, \
|
|
||||||
cntx \
|
|
||||||
); \
|
|
||||||
\
|
|
||||||
/* Pack the stored triangle of c11 to p11. */ \
|
|
||||||
{ \
|
{ \
|
||||||
dim_t p11_m = panel_dim; \
|
if ( bli_is_1m_packed( schema ) ) \
|
||||||
dim_t p11_n = panel_dim; \
|
|
||||||
dim_t j2 = diagoffc_abs; \
|
|
||||||
ctype* restrict c11 = c + (j2 )*ldc; \
|
|
||||||
ctype* restrict p11 = p + (j2 )*ldp; \
|
|
||||||
trans_t transc = ( trans_t )conjc; \
|
|
||||||
\
|
|
||||||
PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
|
|
||||||
( \
|
|
||||||
0, \
|
|
||||||
BLIS_NONUNIT_DIAG, \
|
|
||||||
uploc, \
|
|
||||||
transc, \
|
|
||||||
p11_m, \
|
|
||||||
p11_n, \
|
|
||||||
c11, incc, ldc, \
|
|
||||||
p11, 1, ldp, \
|
|
||||||
cntx, \
|
|
||||||
NULL \
|
|
||||||
); \
|
|
||||||
\
|
|
||||||
/* If source matrix c is Hermitian, we have to zero out the
|
|
||||||
imaginary components of the diagonal of p11 in case the
|
|
||||||
corresponding elements in c11 were not already zero. */ \
|
|
||||||
if ( bli_is_hermitian( strucc ) ) \
|
|
||||||
{ \
|
{ \
|
||||||
ctype* restrict pi11 = p11; \
|
ctype_r* restrict zero = PASTEMAC(chr,0); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < p11_m; ++i ) \
|
PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
|
||||||
{ \
|
( \
|
||||||
PASTEMAC(ch,seti0s)( *pi11 ); \
|
BLIS_NO_CONJUGATE, \
|
||||||
\
|
0, \
|
||||||
pi11 += 1 + ldp; \
|
BLIS_NONUNIT_DIAG, \
|
||||||
} \
|
BLIS_DENSE, \
|
||||||
|
packmrnr_r, \
|
||||||
|
p10_len_max * 2, \
|
||||||
|
zero, \
|
||||||
|
( ctype_r* )p10, 1, ldp, \
|
||||||
|
cntx, \
|
||||||
|
NULL \
|
||||||
|
); \
|
||||||
} \
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||||
\
|
\
|
||||||
/* Now that the diagonal has been made explicitly Hermitian
|
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
|
||||||
(if applicable), we can now safely scale the stored
|
( \
|
||||||
triangle specified by uploc. */ \
|
BLIS_NO_CONJUGATE, \
|
||||||
PASTEMAC2(ch,scalm,BLIS_TAPI_EX_SUF) \
|
0, \
|
||||||
|
BLIS_NONUNIT_DIAG, \
|
||||||
|
BLIS_DENSE, \
|
||||||
|
packmrnr, \
|
||||||
|
p10_len_max, \
|
||||||
|
zero, \
|
||||||
|
p10, 1, ldp, \
|
||||||
|
cntx, \
|
||||||
|
NULL \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
f_cxk \
|
||||||
( \
|
( \
|
||||||
BLIS_NO_CONJUGATE, \
|
conjc10, \
|
||||||
0, \
|
schema, \
|
||||||
BLIS_NONUNIT_DIAG, \
|
p10_dim, \
|
||||||
uploc, \
|
p10_len, \
|
||||||
p11_m, \
|
p10_len_max, \
|
||||||
p11_n, \
|
|
||||||
kappa, \
|
kappa, \
|
||||||
p11, 1, ldp, \
|
c10, incc10, ldc10, \
|
||||||
cntx, \
|
p10, ldp, \
|
||||||
NULL \
|
cntx \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Pack to p11. */ \
|
||||||
|
if ( 0 <= diagoffc && diagoffc + panel_dim <= panel_len ) \
|
||||||
|
{ \
|
||||||
|
dim_t i = diagoffc; \
|
||||||
|
dim_t p11_dim = panel_dim; \
|
||||||
|
dim_t p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len \
|
||||||
|
? panel_len_pad : 0 ); \
|
||||||
|
ctype* p11 = p + i * ldp; \
|
||||||
|
conj_t conjc11 = conjc; \
|
||||||
|
ctype* c11 = c + i * ldc; \
|
||||||
|
inc_t incc11 = incc; \
|
||||||
|
inc_t ldc11 = ldc; \
|
||||||
|
\
|
||||||
|
f_cxc \
|
||||||
|
( \
|
||||||
|
strucc, \
|
||||||
|
diagc, \
|
||||||
|
uploc, \
|
||||||
|
conjc11, \
|
||||||
|
schema, \
|
||||||
|
invdiag, \
|
||||||
|
p11_dim, \
|
||||||
|
p11_len_max, \
|
||||||
|
kappa, \
|
||||||
|
c11, incc11, ldc11, \
|
||||||
|
p11, ldp, \
|
||||||
|
cntx \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Pack to p12. */ \
|
||||||
|
if ( diagoffc + panel_dim < panel_len ) \
|
||||||
|
{ \
|
||||||
|
dim_t i = bli_max( 0, diagoffc + panel_dim ); \
|
||||||
|
dim_t p12_dim = panel_dim; \
|
||||||
|
dim_t p12_len = panel_len - i; \
|
||||||
|
/* If we are packing p12, then it is always the last partial block \
|
||||||
|
and so we should make sure to pad with zeros if necessary. */ \
|
||||||
|
dim_t p12_len_max = p12_len + panel_len_pad; \
|
||||||
|
ctype* p12 = p + i * ldp; \
|
||||||
|
conj_t conjc12 = conjc; \
|
||||||
|
ctype* c12 = c + i * ldc; \
|
||||||
|
inc_t incc12 = incc; \
|
||||||
|
inc_t ldc12 = ldc; \
|
||||||
|
\
|
||||||
|
if ( bli_is_lower( uploc ) ) \
|
||||||
|
{ \
|
||||||
|
bli_reflect_to_stored_part( diagoffc - i, c12, incc12, ldc12 ); \
|
||||||
|
\
|
||||||
|
if ( bli_is_hermitian( strucc ) ) \
|
||||||
|
bli_toggle_conj( &conjc12 ); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* If we are referencing the unstored part of a triangular matrix,
|
||||||
|
explicitly store zeros */ \
|
||||||
|
if ( bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) \
|
||||||
|
{ \
|
||||||
|
if ( bli_is_1m_packed( schema ) ) \
|
||||||
|
{ \
|
||||||
|
ctype_r* restrict zero = PASTEMAC(chr,0); \
|
||||||
|
\
|
||||||
|
PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
|
||||||
|
( \
|
||||||
|
BLIS_NO_CONJUGATE, \
|
||||||
|
0, \
|
||||||
|
BLIS_NONUNIT_DIAG, \
|
||||||
|
BLIS_DENSE, \
|
||||||
|
packmrnr_r, \
|
||||||
|
p12_len_max * 2, \
|
||||||
|
zero, \
|
||||||
|
( ctype_r* )p12, 1, ldp, \
|
||||||
|
cntx, \
|
||||||
|
NULL \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||||
|
\
|
||||||
|
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
|
||||||
|
( \
|
||||||
|
BLIS_NO_CONJUGATE, \
|
||||||
|
0, \
|
||||||
|
BLIS_NONUNIT_DIAG, \
|
||||||
|
BLIS_DENSE, \
|
||||||
|
packmrnr, \
|
||||||
|
p12_len_max, \
|
||||||
|
zero, \
|
||||||
|
p12, 1, ldp, \
|
||||||
|
cntx, \
|
||||||
|
NULL \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
f_cxk \
|
||||||
|
( \
|
||||||
|
conjc12, \
|
||||||
|
schema, \
|
||||||
|
p12_dim, \
|
||||||
|
p12_len, \
|
||||||
|
p12_len_max, \
|
||||||
|
kappa, \
|
||||||
|
c12, incc12, ldc12, \
|
||||||
|
p12, ldp, \
|
||||||
|
cntx \
|
||||||
); \
|
); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )
|
INSERT_GENTFUNCR_BASIC2( packm_struc_cxk, packm_cxk, packm_cxc_diag )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#undef GENTFUNC
|
|
||||||
#define GENTFUNC( ctype, ch, varname, kername ) \
|
|
||||||
\
|
|
||||||
void PASTEMAC(ch,varname) \
|
|
||||||
( \
|
|
||||||
struc_t strucc, \
|
|
||||||
diag_t diagc, \
|
|
||||||
uplo_t uploc, \
|
|
||||||
conj_t conjc, \
|
|
||||||
pack_t schema, \
|
|
||||||
bool invdiag, \
|
|
||||||
dim_t panel_dim, \
|
|
||||||
dim_t panel_len, \
|
|
||||||
dim_t panel_dim_max, \
|
|
||||||
dim_t panel_len_max, \
|
|
||||||
dim_t panel_dim_off, \
|
|
||||||
dim_t panel_len_off, \
|
|
||||||
ctype* restrict kappa, \
|
|
||||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
|
||||||
ctype* restrict p, inc_t ldp, \
|
|
||||||
inc_t is_p, \
|
|
||||||
cntx_t* cntx \
|
|
||||||
) \
|
|
||||||
{ \
|
|
||||||
doff_t diagoffc = panel_dim_off - panel_len_off; \
|
|
||||||
\
|
|
||||||
/* Pack the panel. */ \
|
|
||||||
PASTEMAC(ch,kername) \
|
|
||||||
( \
|
|
||||||
conjc, \
|
|
||||||
schema, \
|
|
||||||
panel_dim, \
|
|
||||||
panel_dim_max, \
|
|
||||||
panel_len, \
|
|
||||||
panel_len_max, \
|
|
||||||
kappa, \
|
|
||||||
c, incc, ldc, \
|
|
||||||
p, ldp, \
|
|
||||||
cntx \
|
|
||||||
); \
|
|
||||||
\
|
|
||||||
\
|
|
||||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
|
||||||
diagonal of the packed panel to kappa. */ \
|
|
||||||
if ( bli_is_unit_diag( diagc ) ) \
|
|
||||||
{ \
|
|
||||||
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
|
|
||||||
( \
|
|
||||||
BLIS_NO_CONJUGATE, \
|
|
||||||
diagoffc, \
|
|
||||||
panel_dim, \
|
|
||||||
panel_len, \
|
|
||||||
kappa, \
|
|
||||||
p, 1, ldp, \
|
|
||||||
cntx, \
|
|
||||||
NULL \
|
|
||||||
); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
/* If requested, invert the diagonal of the packed panel. */ \
|
|
||||||
if ( invdiag == TRUE ) \
|
|
||||||
{ \
|
|
||||||
PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \
|
|
||||||
( \
|
|
||||||
diagoffc, \
|
|
||||||
panel_dim, \
|
|
||||||
panel_len, \
|
|
||||||
p, 1, ldp, \
|
|
||||||
cntx, \
|
|
||||||
NULL \
|
|
||||||
); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
|
||||||
we need to reference the "unstored" region on the other side of
|
|
||||||
the diagonal. This amounts to toggling uploc and then shifting
|
|
||||||
the diagonal offset to shrink the newly referenced region (by
|
|
||||||
one diagonal). Note that this zero-filling is not needed for
|
|
||||||
trsm, since the unstored region is not referenced by the trsm
|
|
||||||
micro-kernel; however, zero-filling is needed for trmm, which
|
|
||||||
uses the gemm micro-kernel.*/ \
|
|
||||||
{ \
|
|
||||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
|
||||||
uplo_t uplop = uploc; \
|
|
||||||
\
|
|
||||||
bli_toggle_uplo( &uplop ); \
|
|
||||||
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \
|
|
||||||
\
|
|
||||||
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
|
|
||||||
( \
|
|
||||||
BLIS_NO_CONJUGATE, \
|
|
||||||
diagoffc, \
|
|
||||||
BLIS_NONUNIT_DIAG, \
|
|
||||||
uplop, \
|
|
||||||
panel_dim, \
|
|
||||||
panel_len, \
|
|
||||||
zero, \
|
|
||||||
p, 1, ldp, \
|
|
||||||
cntx, \
|
|
||||||
NULL \
|
|
||||||
); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
/* If this panel is an edge case in both panel dimension and length,
|
|
||||||
then it must be a bottom-right corner case. Set the part of the
|
|
||||||
diagonal that extends into the zero-padded region to identity.
|
|
||||||
NOTE: This is actually only necessary when packing for trsm, as
|
|
||||||
it helps prevent NaNs and Infs from creeping into the computation.
|
|
||||||
However, we set the region to identity for trmm as well. Those
|
|
||||||
1.0's end up getting multiplied by the 0.0's in the zero-padded
|
|
||||||
region of the other matrix, so there is no harm in this. */ \
|
|
||||||
if ( panel_dim != panel_dim_max && \
|
|
||||||
panel_len != panel_len_max ) \
|
|
||||||
{ \
|
|
||||||
ctype* restrict one = PASTEMAC(ch,1); \
|
|
||||||
dim_t i = panel_dim; \
|
|
||||||
dim_t j = panel_len; \
|
|
||||||
dim_t m_br = panel_dim_max - i; \
|
|
||||||
dim_t n_br = panel_len_max - j; \
|
|
||||||
ctype* p_br = p + (i ) + (j )*ldp; \
|
|
||||||
\
|
|
||||||
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
|
|
||||||
( \
|
|
||||||
BLIS_NO_CONJUGATE, \
|
|
||||||
0, \
|
|
||||||
m_br, \
|
|
||||||
n_br, \
|
|
||||||
one, \
|
|
||||||
p_br, 1, ldp, \
|
|
||||||
cntx, \
|
|
||||||
NULL \
|
|
||||||
); \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )
|
|
||||||
|
|
||||||
|
|||||||
@@ -37,5 +37,3 @@
|
|||||||
#include "bli_unpackm_int.h"
|
#include "bli_unpackm_int.h"
|
||||||
|
|
||||||
#include "bli_unpackm_blk_var1.h"
|
#include "bli_unpackm_blk_var1.h"
|
||||||
|
|
||||||
#include "bli_unpackm_cxk.h"
|
|
||||||
|
|||||||
@@ -36,21 +36,22 @@
|
|||||||
|
|
||||||
#define FUNCPTR_T unpackm_fp
|
#define FUNCPTR_T unpackm_fp
|
||||||
|
|
||||||
typedef void (*FUNCPTR_T)(
|
typedef void (*FUNCPTR_T)
|
||||||
struc_t strucc,
|
(
|
||||||
doff_t diagoffc,
|
struc_t strucc,
|
||||||
diag_t diagc,
|
doff_t diagoffc,
|
||||||
uplo_t uploc,
|
diag_t diagc,
|
||||||
trans_t transc,
|
uplo_t uploc,
|
||||||
dim_t m,
|
trans_t transc,
|
||||||
dim_t n,
|
dim_t m,
|
||||||
dim_t m_panel,
|
dim_t n,
|
||||||
dim_t n_panel,
|
dim_t m_panel,
|
||||||
void* p, inc_t rs_p, inc_t cs_p,
|
dim_t n_panel,
|
||||||
dim_t pd_p, inc_t ps_p,
|
void* p, inc_t rs_p, inc_t cs_p,
|
||||||
void* c, inc_t rs_c, inc_t cs_c,
|
dim_t pd_p, inc_t ps_p,
|
||||||
cntx_t* cntx
|
void* c, inc_t rs_c, inc_t cs_c,
|
||||||
);
|
cntx_t* cntx
|
||||||
|
);
|
||||||
|
|
||||||
static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1);
|
static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1);
|
||||||
|
|
||||||
@@ -152,10 +153,10 @@ void PASTEMAC(ch,varname) \
|
|||||||
dim_t iter_dim; \
|
dim_t iter_dim; \
|
||||||
dim_t num_iter; \
|
dim_t num_iter; \
|
||||||
dim_t it, ic, ip; \
|
dim_t it, ic, ip; \
|
||||||
dim_t ic0, ip0; \
|
dim_t ic0, ip0; \
|
||||||
doff_t ic_inc, ip_inc; \
|
doff_t ic_inc, ip_inc; \
|
||||||
doff_t diagoffc_i; \
|
doff_t diagoffc_i; \
|
||||||
doff_t diagoffc_inc; \
|
doff_t diagoffc_inc; \
|
||||||
dim_t panel_len; \
|
dim_t panel_len; \
|
||||||
dim_t panel_dim_i; \
|
dim_t panel_dim_i; \
|
||||||
dim_t panel_dim_max; \
|
dim_t panel_dim_max; \
|
||||||
@@ -164,6 +165,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
inc_t ldp; \
|
inc_t ldp; \
|
||||||
dim_t* m_panel_full; \
|
dim_t* m_panel_full; \
|
||||||
dim_t* n_panel_full; \
|
dim_t* n_panel_full; \
|
||||||
|
pack_t schema; \
|
||||||
\
|
\
|
||||||
\
|
\
|
||||||
/* If c needs a transposition, induce it so that we can more simply
|
/* If c needs a transposition, induce it so that we can more simply
|
||||||
@@ -182,6 +184,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
|
||||||
{ \
|
{ \
|
||||||
/* Prepare to unpack from column panels. */ \
|
/* Prepare to unpack from column panels. */ \
|
||||||
|
schema = BLIS_PACKED_COL_PANELS; \
|
||||||
iter_dim = n; \
|
iter_dim = n; \
|
||||||
panel_len = m; \
|
panel_len = m; \
|
||||||
panel_dim_max = pd_p; \
|
panel_dim_max = pd_p; \
|
||||||
@@ -196,6 +199,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
|
||||||
{ \
|
{ \
|
||||||
/* Prepare to unpack from row panels. */ \
|
/* Prepare to unpack from row panels. */ \
|
||||||
|
schema = BLIS_PACKED_ROW_PANELS; \
|
||||||
iter_dim = m; \
|
iter_dim = m; \
|
||||||
panel_len = n; \
|
panel_len = n; \
|
||||||
panel_dim_max = pd_p; \
|
panel_dim_max = pd_p; \
|
||||||
@@ -207,6 +211,14 @@ void PASTEMAC(ch,varname) \
|
|||||||
m_panel_full = &panel_dim_i; \
|
m_panel_full = &panel_dim_i; \
|
||||||
n_panel_full = &n; \
|
n_panel_full = &n; \
|
||||||
} \
|
} \
|
||||||
|
\
|
||||||
|
num_t dt = PASTEMAC(ch,type); \
|
||||||
|
ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \
|
||||||
|
: BLIS_UNPACKM_MRXK_KER; \
|
||||||
|
\
|
||||||
|
/* Query the context for the unpackm kernel corresponding to the current
|
||||||
|
panel dimension, or kernel id. */ \
|
||||||
|
PASTECH2(ch,unpackm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \
|
||||||
\
|
\
|
||||||
/* Compute the total number of iterations we'll need. */ \
|
/* Compute the total number of iterations we'll need. */ \
|
||||||
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||||
@@ -253,9 +265,10 @@ void PASTEMAC(ch,varname) \
|
|||||||
else \
|
else \
|
||||||
{ \
|
{ \
|
||||||
/* Unpack the current panel. */ \
|
/* Unpack the current panel. */ \
|
||||||
PASTEMAC(ch,unpackm_cxk) \
|
f \
|
||||||
( \
|
( \
|
||||||
BLIS_NO_CONJUGATE, \
|
BLIS_NO_CONJUGATE, \
|
||||||
|
schema, \
|
||||||
panel_dim_i, \
|
panel_dim_i, \
|
||||||
panel_len, \
|
panel_len, \
|
||||||
one, \
|
one, \
|
||||||
|
|||||||
@@ -70,7 +70,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
|
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
|
kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < n_iter; ++i ) \
|
for ( i = 0; i < n_iter; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < n_iter; ++i ) \
|
for ( i = 0; i < n_iter; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,dotxf_ker_ft) kfp_df; \
|
PASTECH(ch,dotxf_ker_ft) kfp_df; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
|
kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \
|
||||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
|
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < n_iter; i += f ) \
|
for ( i = 0; i < n_iter; i += f ) \
|
||||||
|
|||||||
@@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,axpyf_ker_ft) kfp_af; \
|
PASTECH(ch,axpyf_ker_ft) kfp_af; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||||
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
|
kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \
|
||||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
|
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < n_iter; i += f ) \
|
for ( i = 0; i < n_iter; i += f ) \
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( j = 0; j < n; ++j ) \
|
for ( j = 0; j < n; ++j ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
|
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointers. */ \
|
/* Query the context for the kernel function pointers. */ \
|
||||||
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
||||||
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
|
kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
|
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
|
kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
|
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointers. */ \
|
/* Query the context for the kernel function pointers. */ \
|
||||||
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
||||||
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
|
kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -122,7 +122,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointers. */ \
|
/* Query the context for the kernel function pointers. */ \
|
||||||
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
|
PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||||
kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
|
kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
|
||||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
|
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; i += f ) \
|
for ( i = 0; i < m; i += f ) \
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
|
PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
|
kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
|
PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||||
kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
|
kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
|
||||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
|
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; i += f ) \
|
for ( i = 0; i < m; i += f ) \
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
|
PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
|
kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \
|
|||||||
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
PASTECH(ch,axpyv_ker_ft) kfp_av; \
|
||||||
\
|
\
|
||||||
/* Query the context for the kernel function pointer. */ \
|
/* Query the context for the kernel function pointer. */ \
|
||||||
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \
|
||||||
\
|
\
|
||||||
for ( i = 0; i < m; ++i ) \
|
for ( i = 0; i < m; ++i ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user